ir_datasets: NFCorpus (NutritionFacts)
"NFCorpus is a full-text English retrieval data set for Medical Information Retrieval. It contains a total of 3,244 natural language queries (written in non-technical English, harvested from the NutritionFacts.org site) with 169,756 automatically extracted relevance judgments for 9,964 medical documents (written in a complex terminology-heavy language), mostly from PubMed."
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, title, abstract>
Official dev set. Queries include both a title and a combined "all" text field (titles, descriptions, topics, transcripts, and comments).
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/dev')
for query in dataset.queries_iter():
query # namedtuple<query_id, title, all>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/dev')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, title, abstract>
Relevance levels
Rel. | Definition |
---|---|
0 | Marginally relevant, based on topic containment. |
1 | A link exists from the query to another query that directly links to the document. |
2 | A direct link from the query to the document in the cited sources section of a page. |
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/dev')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
Official dev set, filtered to exclude queries from topic pages.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/dev/nontopic')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/dev/nontopic')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, title, abstract>
Relevance levels
Rel. | Definition |
---|---|
0 | Marginally relevant, based on topic containment. |
1 | A link exists from the query to another query that directly links to the document. |
2 | A direct link from the query to the document in the cited sources section of a page. |
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/dev/nontopic')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
Official dev set, filtered to only include queries from video pages.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/dev/video')
for query in dataset.queries_iter():
query # namedtuple<query_id, title, desc>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/dev/video')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, title, abstract>
Relevance levels
Rel. | Definition |
---|---|
0 | Marginally relevant, based on topic containment. |
1 | A link exists from the query to another query that directly links to the document. |
2 | A direct link from the query to the document in the cited sources section of a page. |
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/dev/video')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
Official test set. Queries include both a title and a combined "all" text field (titles, descriptions, topics, transcripts, and comments).
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/test')
for query in dataset.queries_iter():
query # namedtuple<query_id, title, all>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/test')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, title, abstract>
Relevance levels
Rel. | Definition |
---|---|
0 | Marginally relevant, based on topic containment. |
1 | A link exists from the query to another query that directly links to the document. |
2 | A direct link from the query to the document in the cited sources section of a page. |
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/test')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
Official test set, filtered to exclude queries from topic pages.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/test/nontopic')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/test/nontopic')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, title, abstract>
Relevance levels
Rel. | Definition |
---|---|
0 | Marginally relevant, based on topic containment. |
1 | A link exists from the query to another query that directly links to the document. |
2 | A direct link from the query to the document in the cited sources section of a page. |
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/test/nontopic')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
Official test set, filtered to only include queries from video pages.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/test/video')
for query in dataset.queries_iter():
query # namedtuple<query_id, title, desc>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/test/video')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, title, abstract>
Relevance levels
Rel. | Definition |
---|---|
0 | Marginally relevant, based on topic containment. |
1 | A link exists from the query to another query that directly links to the document. |
2 | A direct link from the query to the document in the cited sources section of a page. |
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/test/video')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
Official train set. Queries include both a title and a combined "all" text field (titles, descriptions, topics, transcripts, and comments).
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/train')
for query in dataset.queries_iter():
query # namedtuple<query_id, title, all>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/train')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, title, abstract>
Relevance levels
Rel. | Definition |
---|---|
0 | Marginally relevant, based on topic containment. |
1 | A link exists from the query to another query that directly links to the document. |
2 | A direct link from the query to the document in the cited sources section of a page. |
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/train')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
Official train set, filtered to exclude queries from topic pages.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/train/nontopic')
for query in dataset.queries_iter():
query # namedtuple<query_id, text>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/train/nontopic')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, title, abstract>
Relevance levels
Rel. | Definition |
---|---|
0 | Marginally relevant, based on topic containment. |
1 | A link exists from the query to another query that directly links to the document. |
2 | A direct link from the query to the document in the cited sources section of a page. |
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/train/nontopic')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>
Official train set, filtered to only include queries from video pages.
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/train/video')
for query in dataset.queries_iter():
query # namedtuple<query_id, title, desc>
Language: en
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/train/video')
for doc in dataset.docs_iter():
doc # namedtuple<doc_id, url, title, abstract>
Relevance levels
Rel. | Definition |
---|---|
0 | Marginally relevant, based on topic containment. |
1 | A link exists from the query to another query that directly links to the document. |
2 | A direct link from the query to the document in the cited sources section of a page. |
Example
import ir_datasets
dataset = ir_datasets.load('nfcorpus/train/video')
for qrel in dataset.qrels_iter():
qrel # namedtuple<query_id, doc_id, relevance, iteration>