Coverage for teiphy/variation_unit.py: 100.00%
121 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-15 16:57 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-15 16:57 +0000
1#!/usr/bin/env python3
3from lxml import etree as et # for reading TEI XML inputs
5from .common import xml_ns, tei_ns
6from .reading import Reading
class VariationUnit:
    """Base class for storing TEI XML variation unit data internally.

    This corresponds to an app element in the collation.

    Attributes:
        id: The ID string of this variation unit, which should be unique.
        analysis_categories: A list of the analysis categories named in the app element's ana attribute
            (with any "#" markers stripped).
        readings: A list of Readings contained in this VariationUnit.
        intrinsic_relations: A dictionary mapping pairs of IDs of Readings in this VariationUnit to the intrinsic odds category
            describing the two readings' relative probability of being authorial.
        transcriptional_relations_by_date_range: A dictionary mapping each pair of consecutive threshold dates
            (with None standing for an open-ended bound) to a dictionary that maps pairs of IDs of Readings
            in this VariationUnit to a set of transcriptional change categories
            that could explain the rise of the second reading from the first during that period.
    """

    def __init__(self, xml: et.Element, verbose: bool = False):
        """Constructs a new VariationUnit instance from the TEI XML input.

        Args:
            xml: An lxml.etree.Element representing an app element.
            verbose: An optional boolean flag indicating whether or not to print status updates.
        """
        # Use its xml:id if it has one; otherwise, use its n, from, and to attributes if it has them:
        self.id = ""
        xml_id = xml.get("{%s}id" % xml_ns)
        n = xml.get("n")
        if xml_id is not None:
            self.id = xml_id
        elif n is not None:
            self.id = n
            from_index = xml.get("from")
            to_index = xml.get("to")
            if from_index is not None and to_index is not None:
                if from_index != to_index:
                    self.id += "_" + from_index + "_" + to_index  # range of word indices
                else:
                    self.id += "_" + from_index  # single word index
        # Initialize its list of analysis categories (for weighting changes in this unit in the stemma program):
        self.analysis_categories = []
        ana = xml.get("ana")
        if ana is not None:
            # str.strip removes "#" from both ends of each token, not just the front:
            self.analysis_categories = [category.strip("#") for category in ana.split()]
        # Initialize its list of readings:
        self.readings = []
        # Initialize its dictionaries of intrinsic and transcriptional relations:
        self.intrinsic_relations = {}
        self.transcriptional_relations_by_date_range = {}
        # Now parse the app element to populate these data structures:
        self.parse(xml, verbose)
        if verbose:
            print("New VariationUnit %s with %d readings" % (self.id, len(self.readings)))

    def __str__(self):
        return self.id

    def __repr__(self):
        return str(self)

    def parse(self, xml: et.Element, verbose: bool = False):
        """Given an XML element, recursively parses its subelements for readings, reading groups, witness details, and relations.

        The app element itself and any note elements are traversed recursively;
        rdgGrp, lem, rdg, witDetail, and listRelation elements are processed as described in the inline comments.
        Elements with any other tag, as well as non-element nodes such as comments, are ignored.

        Args:
            xml: An lxml.etree.Element representing an app element or one of its descendants.
            verbose: An optional boolean flag indicating whether or not to print status updates.
        """
        # Comments and processing instructions do not have a string tag, so there is nothing to parse in them:
        if not isinstance(xml.tag, str):
            return
        # Determine what this element is:
        raw_tag = xml.tag.replace("{%s}" % tei_ns, "")
        if raw_tag == "app":
            # If it is an apparatus, then initialize the readings list and process its child elements recursively:
            self.readings = []
            for child in xml:
                self.parse(child, verbose)
        elif raw_tag == "rdgGrp":
            self._parse_reading_group(xml, verbose)
        elif raw_tag == "lem":
            # If it is a lemma, then add it as a reading if has its own witness list
            # (even if that list is empty, which may occur for a conjecture);
            # otherwise, assume its reading is duplicated in a <rdg> element and skip it:
            if xml.get("wit") is not None:
                self.readings.append(Reading(xml, verbose))
        elif raw_tag == "rdg":
            # If it is a reading, then add it as a reading:
            self.readings.append(Reading(xml, verbose))
        elif raw_tag == "witDetail":
            self._parse_witness_detail(xml, verbose)
        elif raw_tag == "note":
            # If it is a note, then process the child elements of the note recursively:
            for child in xml:
                self.parse(child, verbose)
        elif raw_tag == "listRelation":
            # If it is a list of relations, then populate the corresponding dictionary
            # (lists of any other type are ignored):
            relation_type = xml.get("type")
            if relation_type == "intrinsic":
                self._parse_intrinsic_relations(xml)
            elif relation_type == "transcriptional":
                self._parse_transcriptional_relations(xml)

    def _parse_reading_group(self, xml: et.Element, verbose: bool = False):
        """Flattens a rdgGrp element, adding the lem and rdg elements directly under it as readings and parsing its other children recursively."""
        # Get the type of this reading group (None if it has no type attribute):
        reading_group_type = xml.get("type")
        for child in xml:
            if not isinstance(child.tag, str):
                continue  # skip comments and processing instructions
            child_raw_tag = child.tag.replace("{%s}" % tei_ns, "")
            # Any <lem> element in a <rdgGrp> can be assumed not to be a duplicate of a <rdg> element,
            # as there should be only one <lem> at all levels under an <app> element:
            if child_raw_tag in ("lem", "rdg"):
                rdg = Reading(child, verbose)
                # NOTE(review): this overwrites the reading's own type with the group's type (possibly None)
                # whenever the reading already has a type; confirm against Reading that this is the intended condition.
                if rdg.type is not None:
                    rdg.type = reading_group_type
                self.readings.append(rdg)
            else:
                self.parse(child, verbose)

    def _parse_witness_detail(self, xml: et.Element, verbose: bool = False):
        """Adds a witDetail element as a reading, targeting the nearest preceding untargeted reading if it names no targets of its own."""
        wit_detail = Reading(xml, verbose)
        if len(wit_detail.targets) == 0:
            # Per the TEI Guidelines §12.1.4.1, a witDetail without targets refers to the previous rdg element,
            # so search backwards for the closest reading that does not itself have targets.
            # If there is no such reading, then leave this witDetail without a target instead of indexing past the start of the list:
            previous_rdg = next((rdg for rdg in reversed(self.readings) if len(rdg.targets) == 0), None)
            if previous_rdg is not None:
                wit_detail.targets.append(previous_rdg.id)
                wit_detail.certainties[previous_rdg.id] = 1
        self.readings.append(wit_detail)

    @staticmethod
    def _is_complete_relation(relation) -> bool:
        """Returns True if the given child of a listRelation has all of the active, passive, and ana attributes."""
        return all(relation.get(attribute) is not None for attribute in ("active", "passive", "ana"))

    def _parse_intrinsic_relations(self, xml: et.Element):
        """Populates intrinsic_relations from a listRelation element of type "intrinsic"."""
        self.intrinsic_relations = {}
        for child in xml:
            if not self._is_complete_relation(child):
                continue
            from_readings = child.get("active").replace("#", "").split()
            to_readings = child.get("passive").replace("#", "").split()
            intrinsic_categories = child.get("ana").replace("#", "").split()
            if not intrinsic_categories:
                continue  # a blank ana attribute names no category, so there is nothing to record
            intrinsic_category = intrinsic_categories[0]  # there shouldn't be more than one of these
            # For each pair of readings, assign them the specified category:
            for from_reading in from_readings:
                for to_reading in to_readings:
                    self.intrinsic_relations[(from_reading, to_reading)] = intrinsic_category

    def _parse_transcriptional_relations(self, xml: et.Element):
        """Populates transcriptional_relations_by_date_range from a listRelation element of type "transcriptional".

        Raises:
            ValueError: If a notBefore or notAfter attribute of a relation cannot be converted to an integer.
        """
        self.transcriptional_relations_by_date_range = {}
        relations = [child for child in xml if self._is_complete_relation(child)]
        # In a first pass, gather any dates specified for transcriptional relations.
        # They are deduplicated as integers so that different spellings of one date (e.g., "0500" and "500")
        # do not produce a repeated threshold and a degenerate (500, 500) range:
        unique_dates = set()
        for relation in relations:
            for bound in ("notBefore", "notAfter"):
                date_string = relation.get(bound)
                if date_string is not None:
                    unique_dates.add(int(date_string))
        # Then add null entries corresponding to periods before and after the first and last specified dates, respectively:
        threshold_dates = [None] + sorted(unique_dates) + [None]
        # Then initialize the output dictionary to map each pair of consecutive dates
        # to a dictionary of the transcriptional relations that hold between them:
        date_ranges = list(zip(threshold_dates, threshold_dates[1:]))
        for date_range in date_ranges:
            self.transcriptional_relations_by_date_range[date_range] = {}
        # Then, in a second pass, populate a map from (active, passive) reading tuples
        # to their transcriptional categories for each consecutive pair of dates:
        for relation in relations:
            from_readings = relation.get("active").replace("#", "").split()
            to_readings = relation.get("passive").replace("#", "").split()
            transcriptional_categories = relation.get("ana").replace("#", "").split()
            # A relation without a bound extends to the open-ended range on that side:
            start_index = 0
            end_index = len(date_ranges)
            if relation.get("notBefore") is not None:
                start_index = threshold_dates.index(int(relation.get("notBefore")))
            if relation.get("notAfter") is not None:
                end_index = threshold_dates.index(int(relation.get("notAfter")))
            for date_range in date_ranges[start_index:end_index]:
                relations_in_range = self.transcriptional_relations_by_date_range[date_range]
                # For each pair of readings, assign them to the specified category or categories
                # (we only need distinct categories for each transition, hence the set):
                for from_reading in from_readings:
                    for to_reading in to_readings:
                        relations_in_range.setdefault((from_reading, to_reading), set()).update(
                            transcriptional_categories
                        )