Coverage for /usr/local/lib/python3.11/site-packages/tgclients/metadata.py: 92%

90 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-04-02 16:49 +0000

1# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen 

2# 

3# SPDX-License-Identifier: LGPL-3.0-or-later 

4 

5"""Helper functions to work with TextGrid metadata XML""" 

6import os 

7import re 

8import logging 

9from pathlib import Path 

10from typing import Optional 

11 

12from jinja2 import Environment, FileSystemLoader 

13from xsdata.formats.dataclass.context import XmlContext 

14from xsdata.formats.dataclass.parsers import XmlParser 

15 

16from tgclients.databinding.tgsearch import Response, ResultType 

17from tgclients.databinding.textgrid_metadata_2010 import MetadataContainerType 

18 

# PyICU is an optional dependency: when it cannot be imported the module-level
# name ``icu`` is bound to None and callers check for that before using it.
try:
    import icu
except ImportError:
    icu = None

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Resolved absolute path of the directory containing this module; used to
# locate data files stored next to it.
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__))
)

28 

class TextgridMetadata:
    """Helper functions to work with TextGrid metadata XML.

    An instance holds an xsdata XML parser, a mapping from MIME types to
    filename extensions (read from the ``mime.types`` file located next to
    this module) and, if PyICU is installed, a transliterator built from
    ``tgfilenames.rules`` which is used to create safe filenames.
    """

    def __init__(self):
        context = XmlContext()
        self._parser = XmlParser(context=context)
        self._file_extension_map = self._build_extension_map()
        # Always define the attribute, so it exists even without PyICU.
        self._transliterator = (
            self._create_transliterator() if icu is not None else None)

    @staticmethod
    def create(title: str, mimetype: str) -> str:
        """Create XML metadata for a TextGrid object.

        Renders the ``metadata.xml.jinja2`` template from the ``templates``
        directory next to this module, with autoescaping enabled.

        Args:
            title (str): title of the object
            mimetype (str): format / MIME type of the object

        Returns:
            str: XML metadata as string
        """
        path = Path(__file__).parent / 'templates'
        env = Environment(
            loader=FileSystemLoader(path), autoescape=True)
        template = env.get_template('metadata.xml.jinja2')
        return template.render(title=title, format=mimetype)

    def build(self, title: str, mimetype: str) -> MetadataContainerType:
        """Build metadata for a TextGrid object.

        Args:
            title (str): title of the object
            mimetype (str): format / MIME type of the object

        Returns:
            MetadataContainerType: metadata
        """
        # create() is a static method: call it on this instance instead of
        # constructing a second TextgridMetadata, which would re-read the
        # mime.types file (and rebuild the transliterator) on every call.
        metadata = self.create(title, mimetype)
        return self._parser.from_string(metadata, MetadataContainerType)

    def searchresponse2object(self, xml: str) -> Response:
        """Parse a tgsearch response XML string into a databinding object.

        Args:
            xml (str): the XML of a tgsearch response

        Returns:
            Response: the parsed response
        """
        return self._parser.from_string(xml, Response)

    def filename_from_metadata(self, metadata: ResultType) -> str:
        """Generate a filename for a textgrid search metadata result
        containing title, textgrid-uri and extension.

        Results which are not authorized get a placeholder title and no
        extension.

        Args:
            metadata (ResultType): tgsearch metadata result

        Returns:
            str: the filename
        """
        if metadata.authorized is False:
            title = 'Restricted TextGrid Object'
            mimetype = None
        else:
            title = metadata.object_value.generic.provided.title[0]
            mimetype = metadata.object_value.generic.provided.format
        uri = metadata.object_value.generic.generated.textgrid_uri.value
        return self.filename(title, uri, mimetype)

    def filename(self, title: str, tguri: str, mimetype: Optional[str]) -> str:
        """Generate a filename for the triple of
        title, textgrid-uri and extension.

        Args:
            title (str): the title
            tguri (str): the textgrid uri
            mimetype (Optional[str]): the mime type (e.g. 'text/xml'); if it
                is None or unknown, the filename gets no extension

        Returns:
            str: the filename
        """
        title = self.transliterate(title)
        tg_id = self.remove_tg_prefix(tguri)
        ext = self.extension_for_format(mimetype)
        if ext is not None:
            return f'{title}.{tg_id}.{ext}'
        return f'{title}.{tg_id}'

    def _build_extension_map(self) -> dict:
        """Read the ``mime.types`` file next to this module.

        Returns:
            dict: mime type -> list of filename extensions
        """
        with open(os.path.join(__location__, 'mime.types'), encoding='utf8') as mimefile:
            return self._parse_mime_types(mimefile)

    @staticmethod
    def _parse_mime_types(lines) -> dict:
        """Parse lines in mime.types format into an extension map.

        Each entry line has the form ``<mime type> <ext> [<ext> ...]``,
        separated by spaces or tabs; everything after a ``#`` is a comment.
        Lines without at least one extension are ignored.

        Args:
            lines: iterable of text lines (e.g. an open text file)

        Returns:
            dict: mime type -> list of filename extensions, in the order in
                which the extensions were defined
        """
        # converted to python from info.textgrid.utils.export.filenames.FileExtensionMap
        # of link-rewriter (https://gitlab.gwdg.de/dariah-de/textgridrep/link-rewriter)
        #
        # The separator between mime type and extensions is mandatory
        # ([ \t]+): otherwise a line which only contains a mime type would
        # match by backtracking, e.g. 'text/xml' as 'text/xm' -> ['l'].
        map_line_pattern = re.compile(
            r'^[ \t]*([^# \t]+)[ \t]+([^#]+)[ \t]*(#.*)?$')
        space_pattern = re.compile(r'[ \t]+')

        extension_map = {}
        for line in lines:
            line_match = map_line_pattern.match(line.rstrip('\n'))
            if line_match is None:
                continue
            # group(2) may carry trailing whitespace (e.g. before a comment),
            # which yields empty strings when splitting - drop them.
            entry = [ext for ext in space_pattern.split(line_match.group(2)) if ext]
            if not entry:
                continue
            # extend the list in the dict, so extensions defined first are first in list
            extension_map.setdefault(line_match.group(1), []).extend(entry)
        return extension_map

    def extension_for_format(self, mimetype: Optional[str]) -> Optional[str]:
        """Find a matching extension for a textgrid mime type.
        The first matching extension for a mime type is returned, so
        extensions defined first in mime.types will be used.

        Args:
            mimetype (Optional[str]): the mime type, as found in textgrid
                metadata format field (e.g. text/xml)

        Returns:
            Optional[str]: a filename extension, or None if the mime type
                is not known
        """
        extensions = self._file_extension_map.get(mimetype)
        return extensions[0] if extensions else None

    @staticmethod
    def remove_tg_prefix(tguri: str) -> str:
        """Remove the 'textgrid:' prefix from a textgrid uri.

        Args:
            tguri (str): the textgrid uri (e.g. 'textgrid:kv2q.0')

        Returns:
            str: the uri without prefix (e.g. 'kv2q.0'); a string which does
                not start with the prefix is returned unchanged
        """
        prefix = 'textgrid:'
        # only cut the prefix if it is there, do not blindly drop 9 chars
        if tguri.startswith(prefix):
            return tguri[len(prefix):]
        return tguri

    @staticmethod
    def id_from_filename(filename: str) -> str:
        """Extract the id from a filename which is named according to link rewriters
        textgrid metadata to filename mapping.

        Args:
            filename (str): the filename (e.g. 'Title.kv2q.0.xml')

        Returns:
            str: the id including revision (e.g. 'kv2q.0')
        """
        last_dot = filename.rfind('.')
        next_to_last_dot = filename.rfind('.', 0, last_dot)
        # a textgrid uri has a revision number in the end.
        # if the chars after the last dot are not numeric, we have a filename extension
        if not filename[last_dot+1:].isnumeric():
            # extension is there? we need the '.' before the dot separating the uri
            # from the revision component
            next_to_last_dot = filename.rfind('.', 0, next_to_last_dot)
        else:
            # there is no extension to cut off, we want the end of the string
            last_dot = None

        return filename[next_to_last_dot+1:last_dot]

    def transliterate(self, title: str) -> str:
        """Replace all chars which may be problematic in filenames from title string.

        With PyICU installed the transliterator created from
        ``tgfilenames.rules`` is applied. Without PyICU a warning is logged
        and only spaces, square brackets and colons are replaced by '_'.

        Args:
            title (str): a title from textgrid metadata

        Returns:
            str: the title string with problematic chars replaced
        """
        if icu is None:
            logger.warning('Transliterating without PyICU, you may need that for correct results')
            return title.replace(' ', '_').replace(']', '_').replace('[', '_').replace(':', '_')
        return self._transliterator.transliterate(title)

    # return type needs to be hidden, because it is not available if PyICU is not installed
    def _create_transliterator(self):
        """Create the PyICU transliterator from the ``tgfilenames.rules``
        file located next to this module. Requires PyICU.
        """
        with open(os.path.join(__location__, 'tgfilenames.rules'), encoding='utf8') as rulesfile:
            rules = rulesfile.read()
            return icu.Transliterator.createFromRules(
                'TgFilenames', rules, icu.UTransDirection.FORWARD)

201 'TgFilenames', rules, icu.UTransDirection.FORWARD)