Coverage for /usr/local/lib/python3.10/site-packages/tgclients/search.py: 86%
69 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-02 16:49 +0000
# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen
#
# SPDX-License-Identifier: LGPL-3.0-or-later

"""TextGrid Search API."""
import logging
from io import BytesIO
from typing import Optional, List

import requests
from requests.models import Response
from xsdata.formats.dataclass.context import XmlContext
from xsdata.formats.dataclass.parsers import XmlParser

from tgclients.config import TextgridConfig
from tgclients.databinding.tgsearch import Response as SearchResponse, TextgridUris

# module-level logger, named after this module
logger = logging.getLogger(__name__)


class TextgridSearchRequest:
    """Provide low level access to the TextGrid search service, returning the response objects."""

    def __init__(self, config: Optional[TextgridConfig] = None, nonpublic: bool = False) -> None:
        """Select the service endpoint and prepare a reusable HTTP session.

        Args:
            config (Optional[TextgridConfig]): client configuration. A new ``TextgridConfig()``
                                               is created if omitted. Defaults to None.
            nonpublic (bool): use ``config.search`` as endpoint instead of
                              ``config.search_public``. Defaults to False.
        """
        # create the default per instance: a ``TextgridConfig()`` default in the signature
        # would be evaluated once at definition time and shared by every client
        if config is None:
            config = TextgridConfig()
        self._url = config.search if nonpublic else config.search_public
        self._config = config
        # reuse tcp connections: https://requests.readthedocs.io/en/latest/user/advanced/#session-objects
        self._requests = requests.Session()

    def _get(self, path: str, params: dict) -> Response:
        """Send a GET request to the search service and check the response status.

        Args:
            path (str): path appended to the service URL
            params (dict): query parameters to send

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: HTTP response from service
        """
        response = self._requests.get(
            self._url + path, params=params, timeout=self._config.http_timeout)
        return self._handle_response(response)

    def info(self, textgrid_uri: str, sid: Optional[str] = None) -> Response:
        """Retrieve metadata for a textgrid object specified by its textgrid-uri.

        Args:
            textgrid_uri (str): Textgrid URI
            sid (Optional[str]): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: metadata for uri
        """
        return self._get('/info/' + textgrid_uri, {'sid': sid})

    def list_project_root(self, project_id: str, sid: Optional[str] = None) -> Response:
        """Get objects belonging to a project, filtered by objects that are in
        an aggregation in the same project.

        Args:
            project_id (str): the ID of the project to list
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: HTTP response from service, containing a list of textgrid metadata entries
        """
        return self._get('/navigation/' + project_id, {'sid': sid})

    def list_aggregation(self, textgrid_uri: str, sid: Optional[str] = None) -> Response:
        """Get child resources of an aggregation.

        Args:
            textgrid_uri (str): Textgrid URI
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: HTTP response from service, containing a list of textgrid metadata entries
        """
        return self._get('/navigation/agg/' + textgrid_uri, {'sid': sid})

    # pylint: disable=too-many-arguments,too-many-locals
    def search(self,
               query: Optional[str] = '*',
               sid: Optional[str] = None,
               target: Optional[str] = None,
               order: Optional[str] = None,
               start: Optional[int] = None,
               limit: Optional[int] = None,
               kwic_width: Optional[int] = None,
               word_distance: Optional[int] = None,
               path: Optional[bool] = None,
               all_projects: Optional[bool] = None,
               sandbox: Optional[bool] = None,
               filters: Optional[List[str]] = None,
               facet: Optional[List[str]] = None,
               facet_limit: Optional[int] = None) -> Response:
        """Run fulltext queries or filters on TextGrid metadata and fulltext objects.

        Please note: as the defaults of this function are mostly set to None, the defaults from
        the service are used, and also noted in this docstring. see:
        http://textgridlab.org/doc/services/submodules/tg-search/docs/api/search.html

        Args:
            query (str, optional): Lucene search string. Defaults to '*'.
            sid (str, optional): TextGrid SessionID from tgauth. Defaults to None.
            target (str, optional): where to do fulltext-searches: one of 'structure',
                                    'metadata' and 'both'. Defaults to 'both'.
            order (str, optional): key-value ascending (asc) or descending (desc) and
                                   metadata-field like asc:title or desc:author.
                                   Defaults to 'relevance'.
            start (int, optional): result number to start with. Defaults to 0.
            limit (int, optional): number of entries to return. Defaults to 20.
            kwic_width (int, optional): number of chars before and after a kwic match.
                                        Defaults to 40.
            word_distance (int, optional): max distance between two words in fulltext query.
                                           ignored if set to a number < 0, then for a hit all
                                           words must be contained in one document.
                                           Defaults to -1.
            path (bool, optional): path of found result(work->edition->aggregations) should be
                                   applied to hit. Defaults to false.
            all_projects (bool, optional): all Projects should be searched for public data,
                                           warning: this query may be slow, if many results
                                           found. Defaults to false.
            sandbox (bool, optional): show sandboxed (not yet finally published) data.
                                      Defaults to false.
            filters (List[str], optional): add filter on query results, e.g. for faceting.
                                           Defaults to None.
            facet (List[str], optional): get facets for query results. Defaults to None.
            facet_limit (int, optional): number of results to return for each facet.
                                         Defaults to 10.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: HTTP response from service - a list of textgrid metadata entries,
                      KWIC hits, paths and facets if requested
        """
        # map the pythonic argument names to the query parameter names sent to the service
        params = {
            'q': query,
            'sid': sid,
            'target': target,
            'order': order,
            'start': start,
            'limit': limit,
            'kwicWidth': kwic_width,
            'wordDistance': word_distance,
            'path': path,
            'allProjects': all_projects,
            'sandbox': sandbox,
            'filter': filters,
            'facet': facet,
            'facetLimit': facet_limit
        }
        return self._get('/search', params)
    # pylint: enable=too-many-arguments,too-many-locals

    def edition_work_metadata_for(self, textgrid_uri: str, sid: Optional[str] = None) -> Response:
        """Find parent edition for an object and the edition and work metadata.

        Args:
            textgrid_uri (str): Textgrid URI
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: HTTP response from service - edition and work metadata for given object
                      from first matching parent edition
        """
        return self._get('/info/' + textgrid_uri + '/editionWorkMeta', {'sid': sid})

    def children(self, textgrid_uri: str, sid: Optional[str] = None) -> Response:
        """List URIs for all children of this aggregation and its child aggregations.

        Args:
            textgrid_uri (str): Textgrid URI of an aggregation
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: HTTP response from service - URIs for children of this
                      aggregation and its child aggregations
        """
        return self._get('/info/' + textgrid_uri + '/children', {'sid': sid})

    @staticmethod
    def _handle_response(response: Response) -> Response:
        """Error handling for responses from tgsearch.

        Args:
            response (Response): a response from tgsearch

        Raises:
            TextgridSearchException: if HTTP status code >= 400

        Returns:
            Response: the response
        """
        if not response.ok:
            # only the first 255 characters of the body go into the message
            message = f'[Error] HTTP Code: {response.status_code} - {response.text[0:255]}'
            logger.warning(message)
            raise TextgridSearchException(message)
        return response


class TextgridSearch(TextgridSearchRequest):
    """Provide access to the TextGrid search service using an XML data binding."""

    def __init__(self, config: Optional[TextgridConfig] = None, nonpublic: bool = False) -> None:
        """Set up the underlying request client and the XML parser.

        Args:
            config (Optional[TextgridConfig]): client configuration. A new ``TextgridConfig()``
                                               is created if omitted. Defaults to None.
            nonpublic (bool): passed on to ``TextgridSearchRequest``. Defaults to False.
        """
        # resolve the default here instead of in the signature, where ``TextgridConfig()``
        # would be evaluated once at definition time and shared by every client; this also
        # guarantees the parent initializer receives a real configuration object
        if config is None:
            config = TextgridConfig()
        super().__init__(config, nonpublic)
        # It’s recommended to either reuse the same parser/serializer instance
        # or reuse the context instance. see https://xsdata.readthedocs.io/en/latest/xml.html
        context = XmlContext()
        self._parser = XmlParser(context=context)

    def info(self, textgrid_uri: str, sid: Optional[str] = None) -> SearchResponse:
        """Retrieve metadata for a textgrid object specified by its textgrid-uri.

        Args:
            textgrid_uri (str): Textgrid URI
            sid (Optional[str]): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            SearchResponse: metadata for uri
        """
        response = super().info(textgrid_uri, sid)
        return self._parser.parse(BytesIO(response.content), SearchResponse)

    def list_project_root(self, project_id: str, sid: Optional[str] = None) -> SearchResponse:
        """Get objects belonging to a project, filtered by objects that are in
        an aggregation in the same project.

        Args:
            project_id (str): the ID of the project to list
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            SearchResponse: A list of textgrid metadata entries
        """
        response = super().list_project_root(project_id, sid)
        return self._parser.parse(BytesIO(response.content), SearchResponse)

    def list_aggregation(self, textgrid_uri: str, sid: Optional[str] = None) -> SearchResponse:
        """Get child resources of an aggregation.

        Args:
            textgrid_uri (str): Textgrid URI
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            SearchResponse: A list of textgrid metadata entries
        """
        response = super().list_aggregation(textgrid_uri, sid)
        return self._parser.parse(BytesIO(response.content), SearchResponse)

    # pylint: disable=too-many-arguments,too-many-locals

    def search(self,
               query: Optional[str] = '*',
               sid: Optional[str] = None,
               target: Optional[str] = None,
               order: Optional[str] = None,
               start: Optional[int] = None,
               limit: Optional[int] = None,
               kwic_width: Optional[int] = None,
               word_distance: Optional[int] = None,
               path: Optional[bool] = None,
               all_projects: Optional[bool] = None,
               sandbox: Optional[bool] = None,
               filters: Optional[List[str]] = None,
               facet: Optional[List[str]] = None,
               facet_limit: Optional[int] = None) -> SearchResponse:
        """Run fulltext queries or filters on TextGrid metadata and fulltext objects.

        Please note: as the defaults of this function are mostly set to None, the defaults from
        the service are used, and also noted in this docstring. see:
        http://textgridlab.org/doc/services/submodules/tg-search/docs/api/search.html

        Args:
            query (str, optional): Lucene search string. Defaults to '*'.
            sid (str, optional): TextGrid SessionID from tgauth. Defaults to None.
            target (str, optional): where to do fulltext-searches: one of 'structure',
                                    'metadata' and 'both'. Defaults to 'both'.
            order (str, optional): key-value ascending (asc) or descending (desc) and
                                   metadata-field like asc:title or desc:author.
                                   Defaults to 'relevance'.
            start (int, optional): result number to start with. Defaults to 0.
            limit (int, optional): number of entries to return. Defaults to 20.
            kwic_width (int, optional): number of chars before and after a kwic match.
                                        Defaults to 40.
            word_distance (int, optional): max distance between two words in fulltext query.
                                           ignored if set to a number < 0, then for a hit all
                                           words must be contained in one document.
                                           Defaults to -1.
            path (bool, optional): path of found result(work->edition->aggregations) should be
                                   applied to hit. Defaults to false.
            all_projects (bool, optional): all Projects should be searched for public data,
                                           warning: this query may be slow, if many results
                                           found. Defaults to false.
            sandbox (bool, optional): show sandboxed (not yet finally published) data.
                                      Defaults to false.
            filters (List[str], optional): add filter on query results, e.g. for faceting.
                                           Defaults to None.
            facet (List[str], optional): get facets for query results. Defaults to None.
            facet_limit (int, optional): number of results to return for each facet.
                                         Defaults to 10.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            SearchResponse: a list of textgrid metadata entries,
                            KWIC hits, paths and facets if requested
        """
        response = super().search(query=query,
                                  sid=sid,
                                  target=target,
                                  order=order,
                                  start=start,
                                  limit=limit,
                                  kwic_width=kwic_width,
                                  word_distance=word_distance,
                                  path=path,
                                  all_projects=all_projects,
                                  sandbox=sandbox,
                                  filters=filters,
                                  facet=facet,
                                  facet_limit=facet_limit)
        return self._parser.parse(BytesIO(response.content), SearchResponse)
    # pylint: enable=too-many-arguments,too-many-locals

    def edition_work_metadata_for(self, textgrid_uri: str,
                                  sid: Optional[str] = None) -> SearchResponse:
        """Find parent edition for an object and the edition and work metadata.

        Args:
            textgrid_uri (str): Textgrid URI
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            SearchResponse: Edition and work metadata for given object
                            from first matching parent edition
        """
        response = super().edition_work_metadata_for(textgrid_uri, sid)
        return self._parser.parse(BytesIO(response.content), SearchResponse)

    def children(self, textgrid_uri: str, sid: Optional[str] = None) -> TextgridUris:
        """List URIs for all children of this aggregation and its child aggregations.

        Args:
            textgrid_uri (str): Textgrid URI of an aggregation
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            TextgridUris: URIs for children of this aggregation and its child aggregations
        """
        response = super().children(textgrid_uri, sid)
        # unlike the other calls, this response is bound to TextgridUris, not SearchResponse
        return self._parser.parse(BytesIO(response.content), TextgridUris)


class TextgridSearchException(Exception):
    """Exception communicating with tgsearch.

    Raised by the search clients in this module when the service answers with an
    unsuccessful HTTP status; the message holds the status code and the beginning
    of the response body.
    """