ir_datasets: GOV
GOV web document collection. Used for early TREC Web Tracks. Not to be confused with gov2.
The dataset is obtained for a fee from the University of Glasgow (UoG), and is shipped on a hard drive. More information is provided here.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov')
for doc in dataset.docs_iter():
    doc  # namedtuple<doc_id, url, http_headers, body, body_content_type>
The TREC Web Track 2002 ad-hoc ranking benchmark.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2002')
for query in dataset.queries_iter():
    query  # namedtuple<query_id, title, description, narrative>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2002')
for doc in dataset.docs_iter():
    doc  # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2002')
for qrel in dataset.qrels_iter():
    qrel  # namedtuple<query_id, doc_id, relevance, iteration>
The TREC Web Track 2002 named page ranking benchmark.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2002/named-page')
for query in dataset.queries_iter():
    query  # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2002/named-page')
for doc in dataset.docs_iter():
    doc  # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
1 | Name refers to this page |
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2002/named-page')
for qrel in dataset.qrels_iter():
    qrel  # namedtuple<query_id, doc_id, relevance, iteration>
The TREC Web Track 2003 ad-hoc ranking benchmark.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2003')
for query in dataset.queries_iter():
    query  # namedtuple<query_id, title, description>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2003')
for doc in dataset.docs_iter():
    doc  # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2003')
for qrel in dataset.qrels_iter():
    qrel  # namedtuple<query_id, doc_id, relevance, iteration>
The TREC Web Track 2003 named page ranking benchmark.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2003/named-page')
for query in dataset.queries_iter():
    query  # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2003/named-page')
for doc in dataset.docs_iter():
    doc  # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
1 | Name refers to this page |
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2003/named-page')
for qrel in dataset.qrels_iter():
    qrel  # namedtuple<query_id, doc_id, relevance, iteration>
The TREC Web Track 2004 ad-hoc ranking benchmark.
Queries include a combination of topic distillation, homepage finding, and named page finding.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2004')
for query in dataset.queries_iter():
    query  # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2004')
for doc in dataset.docs_iter():
    doc  # namedtuple<doc_id, url, http_headers, body, body_content_type>
Relevance levels
Rel. | Definition |
---|---|
0 | Not Relevant |
1 | Relevant |
Example
import ir_datasets
dataset = ir_datasets.load('gov/trec-web-2004')
for qrel in dataset.qrels_iter():
    qrel  # namedtuple<query_id, doc_id, relevance, iteration>