Source code for lightning_ir.data.data

"""
Basic sample classes for Lightning IR.

This module defines the basic sample classes for Lightning IR. A sample is a single entry in a dataset and can be grouped
into batches for processing.
"""
  7
  8from dataclasses import dataclass
  9from typing import Any, Dict, List, Sequence
 10
 11import torch
 12from ir_datasets.formats.base import GenericDoc, GenericQuery
 13
 14
[docs] 15@dataclass 16class RankSample: 17 """A sample of ranking data containing a query, a ranked list of documents, and optionally targets and qrels. 18 19 :param query_id: Id of the query 20 :type query_id: str 21 :param query: Query text 22 :type query_id: str 23 :param doc_ids: List of document ids 24 :type doc_ids: Sequence[str] 25 :param docs: List of document texts 26 :type docs: Sequence[str] 27 :param targets: Optional list of target labels denoting the relevane of a document for the query 28 :type targets: torch.Tensor, optional 29 :param qrels: Optional list of dictionaries mapping document ids to relevance labels 30 """ 31 32 query_id: str 33 query: str 34 doc_ids: Sequence[str] 35 docs: Sequence[str] 36 targets: torch.Tensor | None = None 37 qrels: List[Dict[str, Any]] | None = None
38 39
@dataclass
class QuerySample:
    """A sample of query data containing a query and its id.

    :param query_id: Id of the query
    :type query_id: str
    :param query: Query text
    :type query: str
    """

    query_id: str
    query: str

    @classmethod
    def from_ir_dataset_sample(cls, sample: GenericQuery) -> "QuerySample":
        """Create a QuerySample from an ir_datasets sample.

        The sample is read positionally: its first element is used as the query id and its second element as
        the query text.

        :param sample: ir_datasets sample
        :type sample: GenericQuery
        :return: Query sample
        :rtype: QuerySample
        """
        return cls(sample[0], sample[1])
63 64
[docs] 65@dataclass 66class DocSample: 67 """A sample of document data containing a document and its id. 68 69 :param doc_id: Id of the document 70 :type doc_id: str 71 :param doc: Document text 72 :type doc 73 """ 74 75 doc_id: str 76 doc: str 77
[docs] 78 @classmethod 79 def from_ir_dataset_sample(cls, sample: GenericDoc, text_fields: Sequence[str] | None = None) -> "DocSample": 80 """Create a DocSample from an ir_datasets sample. 81 82 :param sample: ir_datasets sample 83 :type sample: GenericDoc 84 :param text_fields: Optional fields to parse the text. If None uses the samples ``default_text()`` 85 defaults to None 86 :type text_fields: Sequence[str] | None, optional 87 :return: Doc sample 88 :rtype: DocSample 89 """ 90 if text_fields is not None: 91 return cls(sample[0], " ".join(getattr(sample, field) for field in text_fields)) 92 return cls(sample[0], sample.default_text())
93 94
[docs] 95@dataclass 96class RankBatch: 97 """A batch of ranking data combining multiple :py:class:`.RankSample` instances 98 99 :param queries: List of query texts 100 :type queries: Sequence[str] 101 :param docs: List of list of document texts 102 :type docs: Sequence[Sequence[str]] 103 :param query_ids: Optional list of query ids 104 :type query_ids: Sequence[str], optional 105 :param doc_ids: Optional list of list of document ids 106 :type doc_ids: Sequence[Sequence[str]], optional 107 :param qrels: Optional list of dictionaries mapping document ids to relevance labels 108 :type qrels: List[Dict[str, Any]], optional 109 """ 110 111 queries: Sequence[str] 112 docs: Sequence[Sequence[str]] 113 query_ids: Sequence[str] | None = None 114 doc_ids: Sequence[Sequence[str]] | None = None 115 qrels: List[Dict[str, int]] | None = None
116 117
@dataclass
class TrainBatch(RankBatch):
    """A batch of ranking data that combines multiple :py:class:`.RankSample` instances and adds training targets.

    :param queries: List of query texts
    :type queries: Sequence[str]
    :param docs: List of list of document texts
    :type docs: Sequence[Sequence[str]]
    :param query_ids: Optional list of query ids, defaults to None
    :type query_ids: Sequence[str], optional
    :param doc_ids: Optional list of list of document ids, defaults to None
    :type doc_ids: Sequence[Sequence[str]], optional
    :param qrels: Optional list of dictionaries mapping document ids to relevance labels, defaults to None
    :type qrels: List[Dict[str, Any]], optional
    :param targets: Optional list of target labels denoting the relevance of a document for the query,
        defaults to None
    :type targets: torch.Tensor, optional
    """

    # Declared after the inherited RankBatch fields, so it is the last field of the generated __init__.
    targets: torch.Tensor | None = None
137 138
@dataclass
class IndexBatch:
    """A batch of documents to index that combines multiple :py:class:`.DocSample` instances.

    :param doc_ids: List of document ids
    :type doc_ids: Sequence[str]
    :param docs: List of document texts
    :type docs: Sequence[str]
    """

    doc_ids: Sequence[str]
    docs: Sequence[str]
151 152
[docs] 153@dataclass 154class SearchBatch: 155 """A batch of search data that combines multiple :py:class:`.QuerySample` instances. Optionaly includes document ids 156 and qrels. 157 158 :param query_ids: List of query ids 159 :type query_ids: Sequence[str] 160 :param queries: List of query texts 161 :type queries: Sequence[str] 162 :param doc_ids: Optional list of list of document ids 163 :type doc_ids: Sequence[Sequence[str]], optional 164 :param qrels: Optional list of dictionaries mapping document ids to relevance labels 165 :type qrels: List[Dict[str, Any]], optional 166 """ 167 168 query_ids: Sequence[str] 169 queries: Sequence[str] 170 doc_ids: Sequence[Sequence[str]] | None = None 171 qrels: List[Dict[str, int]] | None = None