Coverage for /usr/local/lib/python3.11/site-packages/tgclients/metadata.py: 92%
90 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-02 16:49 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-02 16:49 +0000
1# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen
2#
3# SPDX-License-Identifier: LGPL-3.0-or-later
5"""Helper functions to work with TextGrid metadata XML"""
6import os
7import re
8import logging
9from pathlib import Path
10from typing import Optional
12from jinja2 import Environment, FileSystemLoader
13from xsdata.formats.dataclass.context import XmlContext
14from xsdata.formats.dataclass.parsers import XmlParser
16from tgclients.databinding.tgsearch import Response, ResultType
17from tgclients.databinding.textgrid_metadata_2010 import MetadataContainerType
# PyICU is an optional dependency: when it cannot be imported, ``icu`` is set
# to None and code that needs it must check for that before using it.
try:
    import icu
except ImportError:
    icu = None

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Resolved directory containing this module; data files shipped next to the
# module are opened relative to it.
__location__ = os.path.realpath(
    os.path.join(
        os.getcwd(),
        os.path.dirname(__file__),
    )
)
class TextgridMetadata:
    """Helper functions to work with TextGrid metadata XML."""

    # Prefix carried by TextGrid URIs (e.g. 'textgrid:kv2q.0').
    _TG_URI_PREFIX = 'textgrid:'

    # Fallback used when PyICU is not installed: maps the characters
    # ' ', ']', '[' and ':' to '_' in a single pass.
    _FALLBACK_TRANSLATION = str.maketrans({' ': '_', ']': '_', '[': '_', ':': '_'})

    def __init__(self):
        """Set up the XML parser, the MIME type map and the transliterator.

        The transliterator is only created when PyICU could be imported;
        otherwise ``_transliterator`` is None.
        """
        context = XmlContext()
        self._parser = XmlParser(context=context)
        self._file_extension_map = self._build_extension_map()
        # Always define the attribute so it exists with or without PyICU.
        self._transliterator = (
            self._create_transliterator() if icu is not None else None)

    @staticmethod
    def create(title: str, mimetype: str) -> str:
        """Create XML metadata for a TextGrid Object

        Renders the 'metadata.xml.jinja2' template found in the 'templates'
        directory next to this module.

        Args:
            title (str): title of the object
            mimetype (str): format / MIME type of the object

        Returns:
            str: XML metadata as string
        """
        template_dir = Path(__file__).parent / 'templates'
        env = Environment(
            loader=FileSystemLoader(template_dir), autoescape=True)
        template = env.get_template('metadata.xml.jinja2')
        return template.render(title=title, format=mimetype)

    def build(self, title: str, mimetype: str) -> MetadataContainerType:
        """Build metadata for a TextGrid Object

        Args:
            title (str): title of the object
            mimetype (str): format / MIME type of the object

        Returns:
            MetadataContainerType: metadata
        """
        # create() is static: call it directly instead of constructing a
        # second TextgridMetadata (which would re-read mime.types).
        metadata = self.create(title, mimetype)
        return self._parser.from_string(metadata, MetadataContainerType)

    def searchresponse2object(self, xml: str) -> Response:
        """Parse a tgsearch response XML string into a Response object

        Args:
            xml (str): the XML string to parse

        Returns:
            Response: the parsed response
        """
        return self._parser.from_string(xml, Response)

    def filename_from_metadata(self, metadata: ResultType) -> str:
        """Generate a filename for a textgrid search metadata result
        containing title, textgrid-uri and extension

        Args:
            metadata (ResultType): tgsearch metadata result

        Returns:
            str: the filename
        """
        if metadata.authorized is False:
            # title and format are not read for unauthorized results;
            # a placeholder title and no extension are used instead
            title = 'Restricted TextGrid Object'
            mimetype = None
        else:
            title = metadata.object_value.generic.provided.title[0]
            mimetype = metadata.object_value.generic.provided.format
        uri = metadata.object_value.generic.generated.textgrid_uri.value
        return self.filename(title, uri, mimetype)

    def filename(self, title: str, tguri: str, mimetype: Optional[str]) -> str:
        """Generate a filename for the triple of
        title, textgrid-uri and extension

        Args:
            title (str): the title
            tguri (str): the textgrid uri
            mimetype (Optional[str]): the mime type (e.g. 'text/xml'); when it
                is None or has no known extension, no extension is appended

        Returns:
            str: the filename
        """
        title = self.transliterate(title)
        tg_id = self.remove_tg_prefix(tguri)
        ext = self.extension_for_format(mimetype)
        if ext is None:
            return f'{title}.{tg_id}'
        return f'{title}.{tg_id}.{ext}'

    def _build_extension_map(self) -> dict:
        """Read 'mime.types' next to this module into a mime type -> extensions map."""
        # converted to python from info.textgrid.utils.export.filenames.FileExtensionMap
        # of link-rewriter (https://gitlab.gwdg.de/dariah-de/textgridrep/link-rewriter)
        with open(os.path.join(__location__, 'mime.types'), encoding='utf8') as mimefile:
            return self._parse_mime_types(mimefile)

    @staticmethod
    def _parse_mime_types(lines) -> dict:
        """Parse mime.types formatted lines into a mime type -> extensions map.

        Each relevant line has the form ``<mime type> <ext> [<ext> ...] [# comment]``.
        Comment lines, blank lines and lines without any extension are skipped.
        Extensions of a mime type listed on several lines are appended in
        order, so extensions defined first come first in the list.

        Args:
            lines: iterable of text lines (e.g. an open text file)

        Returns:
            dict: mime type mapped to its list of extensions
        """
        extension_map = {}
        # At least one blank is required between the mime type and the
        # extensions; without that a line holding only a mime type would be
        # split in the middle of the type by regex backtracking.
        map_line_pattern = re.compile(
            r'^[ \t]*([^# \t]+)[ \t]+([^#]+)[ \t]*(#.*)?$')
        space_pattern = re.compile(r'[ \t]+')

        for line in lines:
            line_match = map_line_pattern.match(line.rstrip('\n'))
            if line_match is None:
                continue
            # group(2) may carry trailing blanks, which split into empty strings
            extensions = [
                ext for ext in space_pattern.split(line_match.group(2)) if ext]
            if not extensions:
                continue
            # extend the list in the dict, so extensions defined first are first in list
            extension_map.setdefault(line_match.group(1), []).extend(extensions)

        return extension_map

    def extension_for_format(self, mimetype: Optional[str]) -> Optional[str]:
        """Find a matching extension for a textgrid mime type.
        The first matching extension for a mime type is returned, so
        extensions defined first in mime.types will be used.

        Args:
            mimetype (Optional[str]): the mime type, as found in textgrid
                                      metadata format field (e.g. text/xml)

        Returns:
            Optional[str]: a filename extension, or None if the mime type
                           is unknown
        """
        extensions = self._file_extension_map.get(mimetype)
        if not extensions:
            return None
        return extensions[0]

    @staticmethod
    def remove_tg_prefix(tguri: str) -> str:
        """Remove the 'textgrid:' prefix from a textgrid uri

        Args:
            tguri (str): the textgrid uri (e.g. 'textgrid:kv2q.0')

        Returns:
            str: the uri without prefix; returned unchanged if it does not
                 start with 'textgrid:'
        """
        prefix = TextgridMetadata._TG_URI_PREFIX
        if tguri.startswith(prefix):
            return tguri[len(prefix):]
        return tguri

    @staticmethod
    def id_from_filename(filename: str) -> str:
        """extract the id from a filename which is named according to link rewriters
        textgrid metadata to filename mapping

        Expects names of the form '<title>.<id>.<revision>' with an optional
        single trailing '.<extension>'.

        Args:
            filename (str): the filename

        Returns:
            str: the id, including the revision (e.g. 'kv2q.0')
        """
        last_dot = filename.rfind('.')
        next_to_last_dot = filename.rfind('.', 0, last_dot)
        # a textgrid uri has a revision number in the end.
        # if the chars after the last dot are not numeric, we have a filename extension
        if not filename[last_dot+1:].isnumeric():
            # extension is there? we need the '.' before the dot separating the uri
            # from the revision component
            next_to_last_dot = filename.rfind('.', 0, next_to_last_dot)
        else:
            # there is no extension to cut off, we want the end of the string
            last_dot = None

        return filename[next_to_last_dot+1:last_dot]

    def transliterate(self, title: str) -> str:
        """replace all chars which may be problematic in filenames from title string

        With PyICU installed the transliterator created in __init__ is used.
        Without it a warning is logged and only ' ', ']', '[' and ':' are
        replaced by '_'.

        Args:
            title (str): a title from textgrid metadata

        Returns:
            str: the title string with problematic chars replaced
        """
        if icu is None:
            logger.warning('Transliterating without PyICU, you may need that for correct results')
            return title.translate(self._FALLBACK_TRANSLATION)
        return self._transliterator.transliterate(title)

    # return type needs to be hidden, because it is not available if PyICU is not installed
    def _create_transliterator(self):
        """Create an ICU transliterator from 'tgfilenames.rules' next to this module."""
        with open(os.path.join(__location__, 'tgfilenames.rules'), encoding='utf8') as rulesfile:
            rules = rulesfile.read()
            return icu.Transliterator.createFromRules(
                'TgFilenames', rules, icu.UTransDirection.FORWARD)