Source code for lightning_ir.data.data
1"""
Basic sample classes for Lightning IR.

This module defines the basic sample classes for Lightning IR. A sample is a single entry in a dataset and can be
grouped into batches for processing.
6"""
7
8from dataclasses import dataclass
9from typing import Any, Dict, List, Sequence
10
11import torch
12from ir_datasets.formats.base import GenericDoc, GenericQuery
13
14
[docs]
15@dataclass
16class RankSample:
17 """A sample of ranking data containing a query, a ranked list of documents, and optionally targets and qrels.
18
19 :param query_id: Id of the query
20 :type query_id: str
21 :param query: Query text
22 :type query_id: str
23 :param doc_ids: List of document ids
24 :type doc_ids: Sequence[str]
25 :param docs: List of document texts
26 :type docs: Sequence[str]
27 :param targets: Optional list of target labels denoting the relevane of a document for the query
28 :type targets: torch.Tensor, optional
29 :param qrels: Optional list of dictionaries mapping document ids to relevance labels
30 """
31
32 query_id: str
33 query: str
34 doc_ids: Sequence[str]
35 docs: Sequence[str]
36 targets: torch.Tensor | None = None
37 qrels: List[Dict[str, Any]] | None = None
38
39
[docs]
@dataclass
class QuerySample:
    """A single query entry consisting of the query text and its id.

    :param query_id: Id of the query
    :type query_id: str
    :param query: Query text
    :type query: str
    """

    query_id: str
    query: str

    @classmethod
    def from_ir_dataset_sample(cls, sample: GenericQuery) -> "QuerySample":
        """Build a QuerySample from an ir_datasets sample.

        The first two positional entries of ``sample`` are used as the query id and the query text, in that order.

        :param sample: ir_datasets sample
        :type sample: GenericQuery
        :return: Query sample
        :rtype: QuerySample
        """
        query_id, query = sample[0], sample[1]
        return cls(query_id, query)
63
64
[docs]
@dataclass
class DocSample:
    """A sample of document data containing a document and its id.

    :param doc_id: Id of the document
    :type doc_id: str
    :param doc: Document text
    :type doc: str
    """

    doc_id: str
    doc: str

    @classmethod
    def from_ir_dataset_sample(cls, sample: GenericDoc, text_fields: Sequence[str] | None = None) -> "DocSample":
        """Create a DocSample from an ir_datasets sample.

        The document id is read from the first positional entry of ``sample``.

        :param sample: ir_datasets sample
        :type sample: GenericDoc
        :param text_fields: Optional names of the sample attributes whose values are joined with a single space to
            form the document text. A single field name passed as a plain string is treated as a one-element
            sequence. If None uses the sample's ``default_text()``, defaults to None
        :type text_fields: Sequence[str] | None, optional
        :return: Doc sample
        :rtype: DocSample
        """
        if text_fields is None:
            return cls(sample[0], sample.default_text())
        if isinstance(text_fields, str):
            # A bare string is itself a Sequence[str]; without this guard it would be iterated character by
            # character and every character would be looked up as an attribute of the sample.
            text_fields = (text_fields,)
        return cls(sample[0], " ".join(getattr(sample, field) for field in text_fields))
93
94
[docs]
95@dataclass
96class RankBatch:
97 """A batch of ranking data combining multiple :py:class:`.RankSample` instances
98
99 :param queries: List of query texts
100 :type queries: Sequence[str]
101 :param docs: List of list of document texts
102 :type docs: Sequence[Sequence[str]]
103 :param query_ids: Optional list of query ids
104 :type query_ids: Sequence[str], optional
105 :param doc_ids: Optional list of list of document ids
106 :type doc_ids: Sequence[Sequence[str]], optional
107 :param qrels: Optional list of dictionaries mapping document ids to relevance labels
108 :type qrels: List[Dict[str, Any]], optional
109 """
110
111 queries: Sequence[str]
112 docs: Sequence[Sequence[str]]
113 query_ids: Sequence[str] | None = None
114 doc_ids: Sequence[Sequence[str]] | None = None
115 qrels: List[Dict[str, int]] | None = None
116
117
[docs]
@dataclass
class TrainBatch(RankBatch):
    """A :py:class:`.RankBatch` of multiple :py:class:`.RankSample` instances that additionally carries targets.

    :param queries: List of query texts
    :type queries: Sequence[str]
    :param docs: List of list of document texts
    :type docs: Sequence[Sequence[str]]
    :param query_ids: Optional list of query ids, defaults to None
    :type query_ids: Sequence[str] | None, optional
    :param doc_ids: Optional list of list of document ids, defaults to None
    :type doc_ids: Sequence[Sequence[str]] | None, optional
    :param qrels: Optional list of dictionaries mapping document ids to relevance labels, defaults to None
    :type qrels: List[Dict[str, int]] | None, optional
    :param targets: Optional target labels denoting the relevance of a document for the query, defaults to None
    :type targets: torch.Tensor | None, optional
    """

    # The only field added on top of RankBatch; it follows the inherited fields in the generated __init__.
    targets: torch.Tensor | None = None
137
138
[docs]
@dataclass
class IndexBatch:
    """A batch of documents to index that groups multiple :py:class:`.DocSample` instances.

    :param doc_ids: List of document ids
    :type doc_ids: Sequence[str]
    :param docs: List of document texts
    :type docs: Sequence[str]
    """

    # Both fields are required.
    doc_ids: Sequence[str]
    docs: Sequence[str]
151
152
[docs]
153@dataclass
154class SearchBatch:
155 """A batch of search data that combines multiple :py:class:`.QuerySample` instances. Optionaly includes document ids
156 and qrels.
157
158 :param query_ids: List of query ids
159 :type query_ids: Sequence[str]
160 :param queries: List of query texts
161 :type queries: Sequence[str]
162 :param doc_ids: Optional list of list of document ids
163 :type doc_ids: Sequence[Sequence[str]], optional
164 :param qrels: Optional list of dictionaries mapping document ids to relevance labels
165 :type qrels: List[Dict[str, Any]], optional
166 """
167
168 query_ids: Sequence[str]
169 queries: Sequence[str]
170 doc_ids: Sequence[Sequence[str]] | None = None
171 qrels: List[Dict[str, int]] | None = None