Coverage for /usr/local/lib/python3.8/site-packages/tgclients/search.py: 86%

69 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-04-02 16:49 +0000

1# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen 

2# 

3# SPDX-License-Identifier: LGPL-3.0-or-later 

4 

5"""TextGrid Search API.""" 

6import logging 

7from io import BytesIO 

8from typing import Optional, List 

9 

10import requests 

11from requests.models import Response 

12from xsdata.formats.dataclass.context import XmlContext 

13from xsdata.formats.dataclass.parsers import XmlParser 

14 

15from tgclients.config import TextgridConfig 

16from tgclients.databinding.tgsearch import Response as SearchResponse, TextgridUris 

17 

18logger = logging.getLogger(__name__) 

19 

20 

class TextgridSearchRequest:
    """Provide low level access to the TextGrid search service, returning the response objects."""

    def __init__(self, config: Optional[TextgridConfig] = None, nonpublic: bool = False) -> None:
        """Select the search endpoint and set up a reusable HTTP session.

        Args:
            config (Optional[TextgridConfig]): client configuration. A new
                            ``TextgridConfig()`` is created if omitted. Defaults to None.
            nonpublic (bool): use ``config.search`` instead of ``config.search_public``
                            as base URL. Defaults to False.
        """
        # Create the default here instead of in the signature: a call in a default
        # argument is evaluated only once, when the function is defined, so all
        # instances would silently share one and the same config object.
        if config is None:
            config = TextgridConfig()
        self._url = config.search if nonpublic else config.search_public
        self._config = config
        # reuse tcp connections: https://requests.readthedocs.io/en/latest/user/advanced/#session-objects
        self._requests = requests.Session()

    def _get(self, path: str, params: dict) -> Response:
        """Send a GET request to the search service and check the response for errors.

        Args:
            path (str): path to append to the base URL of the search service
            params (dict): query parameters for the request

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: HTTP response from service
        """
        response = self._requests.get(
            self._url + path, params=params, timeout=self._config.http_timeout)
        return self._handle_response(response)

    def info(self, textgrid_uri: str, sid: Optional[str] = None) -> Response:
        """Retrieve metadata for a textgrid object specified by its textgrid-uri.

        Args:
            textgrid_uri (str): Textgrid URI
            sid (Optional[str]): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: metadata for uri
        """
        return self._get('/info/' + textgrid_uri, {'sid': sid})

    def list_project_root(self, project_id: str, sid: Optional[str] = None) -> Response:
        """Get objects belonging to a project, filtered by objects that are in
        an aggregation in the same project.

        Args:
            project_id (str): the ID of the project to list
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: HTTP response from service, containing a list of textgrid metadata entries
        """
        return self._get('/navigation/' + project_id, {'sid': sid})

    def list_aggregation(self, textgrid_uri: str, sid: Optional[str] = None) -> Response:
        """Get child resources of an aggregation.

        Args:
            textgrid_uri (str): Textgrid URI
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: HTTP response from service, containing a list of textgrid metadata entries
        """
        return self._get('/navigation/agg/' + textgrid_uri, {'sid': sid})

    # pylint: disable=too-many-arguments,too-many-locals
    def search(self,
               query: Optional[str] = '*',
               sid: Optional[str] = None,
               target: Optional[str] = None,
               order: Optional[str] = None,
               start: Optional[int] = None,
               limit: Optional[int] = None,
               kwic_width: Optional[int] = None,
               word_distance: Optional[int] = None,
               path: Optional[bool] = None,
               all_projects: Optional[bool] = None,
               sandbox: Optional[bool] = None,
               filters: Optional[List[str]] = None,
               facet: Optional[List[str]] = None,
               facet_limit: Optional[int] = None) -> Response:
        """Run fulltext queries or filters on TextGrid metadata and fulltext objects.

        Please note: as the defaults of this function are mostly set to None, the defaults from
        the service are used, and also noted in this docstring. see:
        http://textgridlab.org/doc/services/submodules/tg-search/docs/api/search.html

        Args:
            query (str, optional): Lucene search string. Defaults to '*'.
            sid (str, optional): TextGrid SessionID from tgauth. Defaults to None.
            target (str, optional): where to do fulltext-searches: one of 'structure',
                            'metadata' and 'both'. Defaults to 'both'.
            order (str, optional): key-value ascending (asc) or descending (desc) and
                            metadata-field like asc:title or desc:author.
                            Defaults to 'relevance'.
            start (int, optional): result number to start with. Defaults to 0.
            limit (int, optional): number of entries to return. Defaults to 20.
            kwic_width (int, optional): number of chars before and after a kwic match.
                            Defaults to 40.
            word_distance (int, optional): max distance between two words in fulltext query.
                            ignored if set to a number < 0, then for a hit all words
                            must be contained in one document. Defaults to -1.
            path (bool, optional): path of found result(work->edition->aggregations) should be
                            applied to hit. Defaults to false.
            all_projects (bool, optional): all Projects should be searched for public data,
                            warning: this query may be slow, if many results found.
                            Defaults to false.
            sandbox (bool, optional): show sandboxed (not yet finally published) data.
                            Defaults to false.
            filters (List[str], optional): add filter on query results, e.g. for faceting.
                            Defaults to None.
            facet (List[str], optional): get facets for query results. Defaults to None.
            facet_limit (int, optional): number of results to return for each facet.
                            Defaults to 10.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: HTTP response from service - a list of textgrid metadata entries,
            KWIC hits, paths and facets if requested
        """
        # Map the pythonic argument names onto the query parameter names of the service.
        params = {
            'q': query,
            'sid': sid,
            'target': target,
            'order': order,
            'start': start,
            'limit': limit,
            'kwicWidth': kwic_width,
            'wordDistance': word_distance,
            'path': path,
            'allProjects': all_projects,
            'sandbox': sandbox,
            'filter': filters,
            'facet': facet,
            'facetLimit': facet_limit,
        }
        return self._get('/search', params)
    # pylint: enable=too-many-arguments,too-many-locals

    def edition_work_metadata_for(self, textgrid_uri: str, sid: Optional[str] = None) -> Response:
        """Find parent edition for an object and the edition and work metadata.

        Args:
            textgrid_uri (str): Textgrid URI
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: HTTP response from service - edition and work metadata for given object
            from first matching parent edition
        """
        return self._get('/info/' + textgrid_uri + '/editionWorkMeta', {'sid': sid})

    def children(self, textgrid_uri: str, sid: Optional[str] = None) -> Response:
        """List URIs for all children of this aggregation and its child aggregations.

        Args:
            textgrid_uri (str): Textgrid URI of an aggregation
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            Response: HTTP response from service - URIs for children of this
            aggregation and its child aggregations
        """
        return self._get('/info/' + textgrid_uri + '/children', {'sid': sid})

    @staticmethod
    def _handle_response(response: Response) -> Response:
        """Error handling for responses from tgsearch.

        Args:
            response (Response): a response from tgsearch

        Raises:
            TextgridSearchException: if HTTP status code >= 400

        Returns:
            Response: the response
        """
        if not response.ok:
            # Only the first 255 characters of the body go into the message.
            message = f'[Error] HTTP Code: {response.status_code} - {response.text[0:255]}'
            logger.warning(message)
            raise TextgridSearchException(message)
        return response

224 

225 

class TextgridSearch(TextgridSearchRequest):
    """Provide access to the TextGrid search service using a XML data binding."""

    def __init__(self, config: Optional[TextgridConfig] = None, nonpublic: bool = False) -> None:
        """Set up the underlying request client and the XML parser.

        Args:
            config (Optional[TextgridConfig]): client configuration. A new
                            ``TextgridConfig()`` is created if omitted. Defaults to None.
            nonpublic (bool): passed on to ``TextgridSearchRequest``. Defaults to False.
        """
        # Create the default here instead of in the signature: a call in a default
        # argument is evaluated only once, when the function is defined, so all
        # instances would silently share one and the same config object.
        if config is None:
            config = TextgridConfig()
        super().__init__(config, nonpublic)
        # It’s recommended to either reuse the same parser/serializer instance
        # or reuse the context instance. see https://xsdata.readthedocs.io/en/latest/xml.html
        context = XmlContext()
        self._parser = XmlParser(context=context)

    def _parse(self, response: Response, binding):
        """Parse the body of a HTTP response into an object of the given binding class.

        Args:
            response (Response): HTTP response whose content is to be parsed
            binding: the data binding class to parse into

        Returns:
            the object returned by the XML parser for the given binding class
        """
        return self._parser.parse(BytesIO(response.content), binding)

    def info(self, textgrid_uri: str, sid: Optional[str] = None) -> SearchResponse:
        """Retrieve metadata for a textgrid object specified by its textgrid-uri.

        Args:
            textgrid_uri (str): Textgrid URI
            sid (Optional[str]): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            SearchResponse: metadata for uri
        """
        return self._parse(super().info(textgrid_uri, sid), SearchResponse)

    def list_project_root(self, project_id: str, sid: Optional[str] = None) -> SearchResponse:
        """Get objects belonging to a project, filtered by objects that are in
        an aggregation in the same project.

        Args:
            project_id (str): the ID of the project to list
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            SearchResponse: A list of textgrid metadata entries
        """
        return self._parse(super().list_project_root(project_id, sid), SearchResponse)

    def list_aggregation(self, textgrid_uri: str, sid: Optional[str] = None) -> SearchResponse:
        """Get child resources of an aggregation.

        Args:
            textgrid_uri (str): Textgrid URI
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            SearchResponse: A list of textgrid metadata entries
        """
        return self._parse(super().list_aggregation(textgrid_uri, sid), SearchResponse)

    # pylint: disable=too-many-arguments,too-many-locals
    def search(self,
               query: Optional[str] = '*',
               sid: Optional[str] = None,
               target: Optional[str] = None,
               order: Optional[str] = None,
               start: Optional[int] = None,
               limit: Optional[int] = None,
               kwic_width: Optional[int] = None,
               word_distance: Optional[int] = None,
               path: Optional[bool] = None,
               all_projects: Optional[bool] = None,
               sandbox: Optional[bool] = None,
               filters: Optional[List[str]] = None,
               facet: Optional[List[str]] = None,
               facet_limit: Optional[int] = None) -> SearchResponse:
        """Run fulltext queries or filters on TextGrid metadata and fulltext objects.

        Please note: as the defaults of this function are mostly set to None, the defaults from
        the service are used, and also noted in this docstring. see:
        http://textgridlab.org/doc/services/submodules/tg-search/docs/api/search.html

        Args:
            query (str, optional): Lucene search string. Defaults to '*'.
            sid (str, optional): TextGrid SessionID from tgauth. Defaults to None.
            target (str, optional): where to do fulltext-searches: one of 'structure',
                            'metadata' and 'both'. Defaults to 'both'.
            order (str, optional): key-value ascending (asc) or descending (desc) and
                            metadata-field like asc:title or desc:author.
                            Defaults to 'relevance'.
            start (int, optional): result number to start with. Defaults to 0.
            limit (int, optional): number of entries to return. Defaults to 20.
            kwic_width (int, optional): number of chars before and after a kwic match.
                            Defaults to 40.
            word_distance (int, optional): max distance between two words in fulltext query.
                            ignored if set to a number < 0, then for a hit all words
                            must be contained in one document. Defaults to -1.
            path (bool, optional): path of found result(work->edition->aggregations) should be
                            applied to hit. Defaults to false.
            all_projects (bool, optional): all Projects should be searched for public data,
                            warning: this query may be slow, if many results found.
                            Defaults to false.
            sandbox (bool, optional): show sandboxed (not yet finally published) data.
                            Defaults to false.
            filters (List[str], optional): add filter on query results, e.g. for faceting.
                            Defaults to None.
            facet (List[str], optional): get facets for query results. Defaults to None.
            facet_limit (int, optional): number of results to return for each facet.
                            Defaults to 10.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            SearchResponse: a list of textgrid metadata entries,
            KWIC hits, paths and facets if requested
        """
        response = super().search(
            query=query,
            sid=sid,
            target=target,
            order=order,
            start=start,
            limit=limit,
            kwic_width=kwic_width,
            word_distance=word_distance,
            path=path,
            all_projects=all_projects,
            sandbox=sandbox,
            filters=filters,
            facet=facet,
            facet_limit=facet_limit,
        )
        return self._parse(response, SearchResponse)
    # pylint: enable=too-many-arguments,too-many-locals

    def edition_work_metadata_for(self, textgrid_uri: str,
                                  sid: Optional[str] = None) -> SearchResponse:
        """Find parent edition for an object and the edition and work metadata.

        Args:
            textgrid_uri (str): Textgrid URI
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            SearchResponse: Edition and work metadata for given object
            from first matching parent edition
        """
        return self._parse(super().edition_work_metadata_for(textgrid_uri, sid), SearchResponse)

    def children(self, textgrid_uri: str, sid: Optional[str] = None) -> TextgridUris:
        """List URIs for all children of this aggregation and its child aggregations.

        Args:
            textgrid_uri (str): Textgrid URI of an aggregation
            sid (Optional[str], optional): Session ID. Defaults to None.

        Raises:
            TextgridSearchException: if HTTP status code >= 400 (# noqa: DAR402)

        Returns:
            TextgridUris: URIs for children of this aggregation and its child aggregations
        """
        return self._parse(super().children(textgrid_uri, sid), TextgridUris)

393 

394 

class TextgridSearchException(Exception):
    """Raised when communication with the tgsearch service fails."""