Coverage for teiphy/variation_unit.py: 100.00%
114 statements
« prev ^ index » next coverage.py v6.5.0, created at 2025-01-15 16:06 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2025-01-15 16:06 +0000
1#!/usr/bin/env python3
3from lxml import etree as et # for reading TEI XML inputs
5from .common import xml_ns, tei_ns
6from .reading import Reading
class VariationUnit:
    """Base class for storing TEI XML variation unit data internally.

    This corresponds to an app element in the collation.

    Attributes:
        id: The ID string of this variation unit, which should be unique.
        readings: A list of Readings contained in this VariationUnit.
        intrinsic_relations: A dictionary mapping pairs of IDs of Readings in this VariationUnit to the intrinsic odds category
            describing the two readings' relative probability of being authorial.
        transcriptional_relations_by_date_range: A dictionary mapping (start, end) date tuples (where None marks an open end)
            to dictionaries that map pairs of IDs of Readings in this VariationUnit to the set of transcriptional change categories
            that could explain the rise of the second reading from the first within that date range.
    """

    def __init__(self, xml: et.Element, verbose: bool = False):
        """Constructs a new VariationUnit instance from the TEI XML input.

        Args:
            xml: An lxml.etree.Element representing an app element.
            verbose: An optional boolean flag indicating whether or not to print status updates.
        """
        # Use its xml:id if it has one; otherwise, use its n, from, and to attributes if it has them:
        self.id = ""
        xml_id = xml.get("{%s}id" % xml_ns)
        n = xml.get("n")
        if xml_id is not None:
            self.id = xml_id
        elif n is not None:
            self.id = n
            from_index = xml.get("from")
            to_index = xml.get("to")
            if from_index is not None and to_index is not None:
                self.id += "_" + from_index + "_" + to_index
        # Initialize its list of readings:
        self.readings = []
        # Initialize its dictionaries of intrinsic and transcriptional relations:
        self.intrinsic_relations = {}
        self.transcriptional_relations_by_date_range = {}
        # Now parse the app element to populate these data structures:
        self.parse(xml, verbose)
        if verbose:
            print("New VariationUnit %s with %d readings" % (self.id, len(self.readings)))

    def __str__(self):
        """Returns the ID of this variation unit."""
        return self.id

    def __repr__(self):
        """Returns the same string as __str__."""
        return str(self)

    def parse(self, xml: et.Element, verbose: bool = False):
        """Given an XML element, recursively parses its subelements for readings, reading groups, witness details, and relation lists.

        app and note elements are traversed recursively; rdgGrp, lem, rdg, witDetail, and listRelation elements are processed;
        any other element (and any non-element node, such as a comment) is ignored.

        Args:
            xml: An lxml.etree.Element representing an app element or one of its descendants.
            verbose: An optional boolean flag indicating whether or not to print status updates.
        """
        # Determine what this element is:
        raw_tag = self._raw_tag(xml)
        if raw_tag == "app":
            # An apparatus (re)initializes the readings list before its children are processed recursively:
            self.readings = []
            for child in xml:
                self.parse(child, verbose)
        elif raw_tag == "rdgGrp":
            self._parse_reading_group(xml, verbose)
        elif raw_tag == "lem":
            # Add a lemma as a reading only if it has its own witness list (even if that list is empty, which may occur for a conjecture);
            # otherwise, assume its reading is duplicated in a <rdg> element and skip it:
            if xml.get("wit") is not None:
                self.readings.append(Reading(xml, verbose))
        elif raw_tag == "rdg":
            self.readings.append(Reading(xml, verbose))
        elif raw_tag == "witDetail":
            self._parse_witness_detail(xml, verbose)
        elif raw_tag == "note":
            # A note is only a container here: process its child elements recursively:
            for child in xml:
                self.parse(child, verbose)
        elif raw_tag == "listRelation":
            relation_type = xml.get("type")
            if relation_type == "intrinsic":
                self._parse_intrinsic_relations(xml)
            elif relation_type == "transcriptional":
                self._parse_transcriptional_relations(xml)

    @staticmethod
    def _raw_tag(element):
        """Returns the element's tag with the TEI namespace removed, or None for nodes whose tag is not a string."""
        tag = element.tag
        # lxml exposes comments and processing instructions with a non-string tag; treat them as unrecognized nodes:
        if not isinstance(tag, str):
            return None
        return tag.replace("{%s}" % tei_ns, "")

    @staticmethod
    def _relation_parts(relation):
        """Returns the (active IDs, passive IDs, ana values) lists of a relation, or None if any of these attributes is missing."""
        active = relation.get("active")
        passive = relation.get("passive")
        ana = relation.get("ana")
        if active is None or passive is None or ana is None:
            return None
        return (
            active.replace("#", "").split(),
            passive.replace("#", "").split(),
            ana.replace("#", "").split(),
        )

    def _parse_reading_group(self, xml, verbose):
        """Flattens a rdgGrp element, adding its lem and rdg children as readings and parsing its other children recursively."""
        # Get the type of this reading group (None if it has none):
        reading_group_type = xml.get("type")
        for child in xml:
            # any <lem> element in a <rdgGrp> can be assumed not to be a duplicate of a <rdg> element,
            # as there should be only one <lem> at all levels under an <app> element
            if self._raw_tag(child) in ("lem", "rdg"):
                rdg = Reading(child, verbose)
                # NOTE(review): the group's type replaces the reading's type only when the reading already has one
                # (and replaces it with None when the group has no type attribute); confirm this is intended.
                if rdg.type is not None:
                    rdg.type = reading_group_type
                self.readings.append(rdg)
            else:
                self.parse(child, verbose)

    def _parse_witness_detail(self, xml, verbose):
        """Adds a witDetail element as a reading, defaulting its target to the previous reading that has no targets of its own.

        The default target follows the TEI Guidelines §12.1.4.1. If no previous reading qualifies
        (there are none, or all of them have targets), the witness detail is added without a target.
        """
        wit_detail = Reading(xml, verbose)
        if len(wit_detail.targets) == 0:
            # Scan backwards, skipping earlier entries that themselves point at other readings:
            for previous_rdg in reversed(self.readings):
                if len(previous_rdg.targets) == 0:
                    wit_detail.targets.append(previous_rdg.id)
                    wit_detail.certainties[previous_rdg.id] = 1
                    break
        self.readings.append(wit_detail)

    def _parse_intrinsic_relations(self, xml):
        """Repopulates intrinsic_relations from the relation children of an intrinsic listRelation element."""
        self.intrinsic_relations = {}
        for child in xml:
            parts = self._relation_parts(child)
            if parts is None:
                continue
            from_readings, to_readings, categories = parts
            # A relation whose ana attribute names no category cannot be assigned one, so skip it:
            if not categories:
                continue
            intrinsic_category = categories[0]  # there shouldn't be more than one of these
            # For each pair of readings, assign them the specified category:
            for from_reading in from_readings:
                for to_reading in to_readings:
                    self.intrinsic_relations[(from_reading, to_reading)] = intrinsic_category

    def _parse_transcriptional_relations(self, xml):
        """Repopulates transcriptional_relations_by_date_range from the relation children of a transcriptional listRelation element.

        Raises:
            ValueError: If a notBefore or notAfter attribute of a complete relation is not an integer string.
        """
        self.transcriptional_relations_by_date_range = {}
        # In a first pass, gather the complete relations and the distinct dates they specify.
        # Dates are compared as integers so that equal dates written differently (e.g., "500" and "0500") yield one threshold:
        relations = []
        distinct_dates = set()
        for child in xml:
            parts = self._relation_parts(child)
            if parts is None:
                continue
            not_before = child.get("notBefore")
            not_after = child.get("notAfter")
            if not_before is not None:
                not_before = int(not_before)
                distinct_dates.add(not_before)
            if not_after is not None:
                not_after = int(not_after)
                distinct_dates.add(not_after)
            relations.append((parts, not_before, not_after))
        # Add null entries corresponding to the periods before the first and after the last specified dates:
        threshold_dates = [None] + sorted(distinct_dates) + [None]
        # Map each pair of consecutive dates to a dictionary of the transcriptional relations that hold between them:
        date_ranges = list(zip(threshold_dates, threshold_dates[1:]))
        for date_range in date_ranges:
            self.transcriptional_relations_by_date_range[date_range] = {}
        # In a second pass, record each relation's categories in every date range it covers:
        for (from_readings, to_readings, categories), not_before, not_after in relations:
            start = 0 if not_before is None else threshold_dates.index(not_before)
            end = len(threshold_dates) - 1 if not_after is None else threshold_dates.index(not_after)
            for date_range in date_ranges[start:end]:
                relations_in_range = self.transcriptional_relations_by_date_range[date_range]
                for from_reading in from_readings:
                    for to_reading in to_readings:
                        # we only need distinct categories for each transition
                        relations_in_range.setdefault((from_reading, to_reading), set()).update(categories)