ir_datasets: GOV2
GOV2 web document collection. Used for the TREC Terabyte Track.
The dataset is obtained for a fee from the University of Glasgow (UoG) and is shipped on a hard drive. More information is provided here.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
TREC 2007 Million Query track.
Language: multiple/other/unknown
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-mq-2007')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-mq-2007')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
2 | Highly Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-mq-2007')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, method, iprob>
TREC 2008 Million Query track.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-mq-2008')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-mq-2008')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
2 | Highly Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-mq-2008')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, method, iprob>
The TREC Terabyte Track 2004 ad-hoc ranking benchmark. Contains 50 queries with deep relevance judgments.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2004')
for query in dataset.queries_iter():
query # namedtuple<query_id, title, description, narrative>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2004')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
2 | Highly Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2004')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
The TREC Terabyte Track 2005 ad-hoc ranking benchmark. Contains 50 queries with deep relevance judgments.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2005')
for query in dataset.queries_iter():
query # namedtuple<query_id, title, description, narrative>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2005')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
2 | Highly Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2005')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
The TREC Terabyte Track 2005 efficiency ranking benchmark. Contains 50,000 queries from a search engine, including the 50 topics from gov2/trec-tb-2005. Only the 50 topics have judgments.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2005/efficiency')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2005/efficiency')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
2 | Highly Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2005/efficiency')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
The TREC Terabyte Track 2005 named page ranking benchmark. Contains 252 queries with titles that resemble bookmark labels. Relevance judgments include near-duplicate pages and other pages that may satisfy the bookmark label.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2005/named-page')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2005/named-page')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2005/named-page')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
The TREC Terabyte Track 2006 ad-hoc ranking benchmark. Contains 50 queries with deep relevance judgments.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006')
for query in dataset.queries_iter():
query # namedtuple<query_id, title, description, narrative>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
2 | Highly Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
The TREC Terabyte Track 2006 efficiency ranking benchmark. Contains 100,000 queries from a search engine, including the 50 topics from gov2/trec-tb-2006. Only the 50 topics have judgments.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
2 | Highly Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
Small stream from gov2/trec-tb-2006/efficiency, with 10,000 queries.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency/10k')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency/10k')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Stream 1 of gov2/trec-tb-2006/efficiency (25,000 queries).
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency/stream1')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency/stream1')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Stream 2 of gov2/trec-tb-2006/efficiency (25,000 queries).
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency/stream2')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency/stream2')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Stream 3 of gov2/trec-tb-2006/efficiency (25,000 queries).
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency/stream3')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency/stream3')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
2 | Highly Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency/stream3')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
Stream 4 of gov2/trec-tb-2006/efficiency (25,000 queries).
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency/stream4')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/efficiency/stream4')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
The TREC Terabyte Track 2006 named page ranking benchmark. Contains 181 queries with titles that resemble bookmark labels. Relevance judgments include near-duplicate pages and other pages that may satisfy the bookmark label.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/named-page')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/named-page')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov2/trec-tb-2006/named-page')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>