Kallimachos Import

With this notebook we will download the Kallimachos Corpus from GitLab, extract the TextGridUris of the original TEI and create a virtual TextGrid collection of the original TEI files.

1. Download Kallimachos Corpus from GitLab and extract list of TextGrid IDs

# Download the corpus as tar.gz file

import urllib.request
# The "path" query parameter in the URL names the droc/DROC-TEI folder of the repository.
corpus_url = "https://gitlab2.informatik.uni-wuerzburg.de/kallimachos/DROC-Release/-/archive/master/DROC-Release-master.tar.gz?path=droc/DROC-TEI"
# Local file name the archive is stored under; it is opened again in the next step.
corpus_archive = "corpus.tar.gz"
urllib.request.urlretrieve(corpus_url, corpus_archive)
('corpus.tar.gz', <http.client.HTTPMessage at 0x7f4e78e45b90>)
# Extract all files, get the TextGridUri and append to list of IDs

import tarfile
import xml.etree.ElementTree as ET

# Collected TextGrid URIs, one per TEI file that declares one.
ids = []

# Open the archive in a context manager so it is closed again, even if parsing
# one of the members raises.
with tarfile.open("corpus.tar.gz") as tar:
    for member in tar.getmembers():
        f = tar.extractfile(member)
        # extractfile() yields None for members without file content
        # (e.g. directories); there is nothing to parse for those.
        if f is None:
            continue
        root = ET.parse(f).getroot()
        # The URI is stored in the TEI header as <idno type="TextGridUri">.
        elem = root.find("./teiHeader/fileDesc/publicationStmt/idno[@type='TextGridUri']")
        if elem is not None:
            ids.append(elem.text)
print(ids)
['textgrid:jk9g.0', 'textgrid:jkgj.0', 'textgrid:jkjt.0', 'textgrid:jngm.0', 'textgrid:k1d4.0', 'textgrid:k2k7.0', 'textgrid:k2k1.0', 'textgrid:k7bz.0', 'textgrid:k590.0', 'textgrid:k2kp.0', 'textgrid:k8ck.0', 'textgrid:k8b1.0', 'textgrid:k8c0.0', 'textgrid:k8d1.0', 'textgrid:k8d2.0', 'textgrid:k8fk.0', 'textgrid:k8gc.0', 'textgrid:k8f8.0', 'textgrid:kcj8.0', 'textgrid:12671.0', 'textgrid:1266q.0', 'textgrid:kck4.0', 'textgrid:kcn3.0', 'textgrid:kjf3.0', 'textgrid:kjdw.0', 'textgrid:kkr8.0', 'textgrid:kmq8.0', 'textgrid:kpzg.0', 'textgrid:kpxq.0', 'textgrid:kq64.0', 'textgrid:ksbf.0', 'textgrid:ksbr.0', 'textgrid:kxdd.0', 'textgrid:m099.0', 'textgrid:m875.0', 'textgrid:mgq9.0', 'textgrid:mh34.0', 'textgrid:mkkk.0', 'textgrid:mks9.0', 'textgrid:mktj.0', 'textgrid:mksx.0', 'textgrid:mks5.0', 'textgrid:mr1g.0', 'textgrid:mx80.0', 'textgrid:mz31.0', 'textgrid:n143.0', 'textgrid:n1mm.0', 'textgrid:n1zz.0', 'textgrid:n1jg.0', 'textgrid:n1dg.0', 'textgrid:n0zz.0', 'textgrid:n1s9.0', 'textgrid:n22m.0', 'textgrid:n25x.0', 'textgrid:n24m.0', 'textgrid:n27h.0', 'textgrid:nkfh.0', 'textgrid:p5bq.0', 'textgrid:qmnz.0', 'textgrid:qmq9.0', 'textgrid:qmzg.0', 'textgrid:qxcx.0', 'textgrid:r220.0', 'textgrid:rm5b.0', 'textgrid:s6m0.0', 'textgrid:sb70.0', 'textgrid:sb47.0', 'textgrid:sc7w.0', 'textgrid:sdgj.0', 'textgrid:sss7.0', 'textgrid:t26g.0', 'textgrid:t90p.0', 'textgrid:t91c.0', 'textgrid:t90k.0', 'textgrid:tbbm.0', 'textgrid:tbbk.0', 'textgrid:tqbp.0', 'textgrid:vbjc.0', 'textgrid:vqkr.0', 'textgrid:vqmz.0', 'textgrid:vs33.0', 'textgrid:w42k.0', 'textgrid:wf65.0', 'textgrid:wq99.0', 'textgrid:wr68.0', 'textgrid:x271.0', 'textgrid:x8m2.0', 'textgrid:x9j9.0', 'textgrid:xkmw.0']

2. Create a TextGrid project for storing data

Choose between the development and the production system by commenting the corresponding line in or out below.

from tgclients.config import DEV_SERVER
from tgclients import TextgridConfig

# Exactly one of the two `config` assignments below should be active. The
# resulting object is handed to the tgclients classes used in this notebook
# (TextgridAuth, TextgridCrudRequest, TextgridSearch).

# production system: uncomment the next line (and comment out the dev line) to use it
#config = TextgridConfig()

# dev system (currently active): configured with the imported DEV_SERVER value
config = TextgridConfig(DEV_SERVER)

Get your SID:

  • production system: https://textgridlab.org/1.0/Shibboleth.sso/Login?target=/1.0/secure/TextGrid-WebAuth.php?authZinstance=textgrid-esx2.gwdg.de

  • development system: https://dev.textgridlab.org/1.0/Shibboleth.sso/Login?target=/1.0/secure/TextGrid-WebAuth.php?authZinstance=textgrid-esx1.gwdg.de

and insert the value below.

# Insert the SID (session ID obtained via one of the login links above)
SID = ''
# create project
from tgclients import TextgridAuth
tgauth = TextgridAuth(config)

# Fail fast with an actionable message: with an empty SID the remote call below
# fails anyway, but only after a round trip and with a long SOAP traceback.
if not SID:
    raise ValueError("SID is empty: log in via one of the links above and paste your session ID into SID.")

# set name and description for the project
project_id = tgauth.create_project(SID, "01-Kallimachos-Test", "Project created with Jupyter Notebook")

print(f"project ID is: {project_id}")
Error creating project. Is your sessionID valid?
---------------------------------------------------------------------------
Fault                                     Traceback (most recent call last)
File /usr/local/lib/python3.11/site-packages/tgclients/auth.py:123, in TextgridAuth.create_project(self, sid, name, description, default_owner_roles)
    122 try:
--> 123     project_id = self._client.service.createProject(
    124         auth=sid, name=name, description=description
    125     )
    126 except Fault as fault:

File /usr/local/lib/python3.11/site-packages/zeep/proxy.py:46, in OperationProxy.__call__(self, *args, **kwargs)
     44     kwargs["_soapheaders"] = soap_headers
---> 46 return self._proxy._binding.send(
     47     self._proxy._client,
     48     self._proxy._binding_options,
     49     self._op_name,
     50     args,
     51     kwargs,
     52 )

File /usr/local/lib/python3.11/site-packages/zeep/wsdl/bindings/soap.py:135, in SoapBinding.send(self, client, options, operation, args, kwargs)
    133     return response
--> 135 return self.process_reply(client, operation_obj, response)

File /usr/local/lib/python3.11/site-packages/zeep/wsdl/bindings/soap.py:229, in SoapBinding.process_reply(self, client, operation, response)
    228 if response.status_code != 200 or fault_node is not None:
--> 229     return self.process_error(doc, operation)
    231 result = operation.process_reply(doc)

File /usr/local/lib/python3.11/site-packages/zeep/wsdl/bindings/soap.py:329, in Soap11Binding.process_error(self, doc, operation)
    327         return child.text
--> 329 raise Fault(
    330     message=get_text("faultstring"),
    331     code=get_text("faultcode"),
    332     actor=get_text("faultactor"),
    333     detail=fault_node.find("detail", namespaces=fault_node.nsmap),
    334 )

Fault: 64

The above exception was the direct cause of the following exception:

TextgridAuthException                     Traceback (most recent call last)
Cell In[6], line 6
      3 tgauth = TextgridAuth(config)
      5 # set name and description for the project
----> 6 project_id = tgauth.create_project(SID, "01-Kallimachos-Test", "Project created with Jupyter Notebook")
      8 print("project ID is: " + project_id)

File /usr/local/lib/python3.11/site-packages/tgclients/auth.py:129, in TextgridAuth.create_project(self, sid, name, description, default_owner_roles)
    127     message = 'Error creating project. Is your sessionID valid?'
    128     logger.warning(message)
--> 129     raise TextgridAuthException(message) from fault
    131 if default_owner_roles:
    132     eppn = self.get_eppn_for_sid(sid)

TextgridAuthException: Error creating project. Is your sessionID valid?

Create an aggregation of type collection containing the TextGridURIs extracted from the Kallimachos Corpus

from tgclients import TextgridMetadata, Utils, TextgridCrudRequest

# Create the aggregation from the list of ids. As this is a new aggregation we do
# not have a textgridURI for it yet, so the placeholder "not-yet-set" is passed instead.
data = Utils.list_to_aggregation("not-yet-set", ids)
# Set the metadata for the aggregation; the mimetype marks it as a collection.
# The title uses the corpus name "Kallimachos" (previously misspelled 'Kalimachos').
metadata = TextgridMetadata.create(title='Kallimachos', mimetype='text/tg.collection+tg.aggregation+xml')

# store the aggregation in your project
tgcrud = TextgridCrudRequest(config)
res = tgcrud.create_resource(SID, project_id, data, metadata)

# The URI of the newly created object is read from the 'Location' response header.
collection_uri = res.headers['Location']
print(f"The textgridURI of the collection is: {collection_uri}")

3. List contents of this newly created collection with tgsearch

from tgclients import TextgridSearch

# Query the search service for the members of the collection created above;
# the SID is passed along with the collection URI.
tgsearch = TextgridSearch(config)
col_content = tgsearch.list_aggregation(collection_uri, SID)

# Print the first title recorded in each member's metadata.
for item in col_content.result:
    provided = item.object_value.generic.provided
    print(provided.title[0])