Coverage for teiphy/reading.py: 100.00%
162 statements
« prev ^ index » next coverage.py v6.5.0, created at 2025-01-15 16:06 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2025-01-15 16:06 +0000
1#!/usr/bin/env python3
3from lxml import etree as et
5from .common import xml_ns, tei_ns
class Reading:
    """Base class for storing TEI XML reading data internally.

    This can correspond to a lem, rdg, or witDetail element in the collation.

    Attributes:
        id: The ID string of this reading, which should be unique within its parent app element.
        type: A string representing the type of reading. Examples include "reconstructed", "defective",
            "orthographic", "subreading", "ambiguous", "overlap", and "lac". The default value is "substantive".
        text: Serialization of the contents of this element.
        wits: A list of sigla referring to witnesses that support this reading.
        targets: A list of other reading ID strings to which this reading corresponds.
            For substantive readings, this should be empty.
            For ambiguous readings, it should contain references to the readings that might correspond to this one.
            For overlap readings, it should contain a reference to the reading from the overlapping variation unit
            responsible for the overlap.
        certainties: A dictionary mapping target reading IDs to floating-point certainty values.
    """

    def __init__(self, xml: et.Element, verbose: bool = False):
        """Constructs a new Reading instance from the TEI XML input.

        Args:
            xml: A lem, rdg, or witDetail element.
            verbose: An optional flag indicating whether or not to print status updates.
        """
        self.type = ""
        self.text = ""
        self.id = ""
        self.targets = []
        self.certainties = {}
        self.wits = []
        self.parse(xml, verbose)
        if verbose:
            # Build the two variable parts of the status line separately
            # so that the message template is only written once:
            wits_summary = ("witnesses %s" % (", ".join(self.wits),)) if self.wits else "no witnesses"
            text_summary = ("text %s" % (self.text,)) if self.text != "" else "no text"
            print("New Reading %s with type %s, %s, and %s" % (self.id, self.type, wits_summary, text_summary))

    @staticmethod
    def _split_refs(value):
        """Splits a space-separated pointer attribute value into a list, stripping any "#" characters
        from the ends of each entry. A missing attribute (None) yields an empty list."""
        if value is None:
            return []
        return [ref.strip("#") for ref in value.split()]

    @staticmethod
    def _serialize_placeholder(xml, label, fallback):
        """Serializes a contentless gap or space element as a bracketed description.

        If both the unit and extent attributes are present, then the description is "<extent> <unit> <label>";
        otherwise, it is the fallback string. A reason attribute, if present, is appended in parentheses.
        """
        unit = xml.get("unit")
        extent = xml.get("extent")
        if unit is not None and extent is not None:
            description = extent + " " + unit + " " + label
        else:
            description = fallback
        reason = xml.get("reason")
        if reason is not None:
            description += " (" + reason + ")"
        return "[" + description + "]"

    def _parse_children(self, xml, verbose):
        """Recursively parses every child of the given element in document order."""
        for child in xml:
            self.parse(child, verbose)

    def _parse_reading(self, xml, verbose, with_targets):
        """Populates this reading from a top-level lem, rdg, or witDetail element.

        Args:
            xml: The lem, rdg, or witDetail element.
            verbose: Passed through to the recursive parse calls.
            with_targets: True for witDetail elements, whose target attribute and certainty children
                populate the targets list and certainties map; lem and rdg elements leave both untouched.
        """
        # If it has a type, then save that; otherwise, default to "substantive":
        self.type = xml.get("type", "substantive")
        if with_targets:
            # Every target starts with a certainty of 0; certainty children processed below overwrite this:
            self.targets = self._split_refs(xml.get("target"))
            self.certainties = {t: 0 for t in self.targets}
        self.wits = self._split_refs(xml.get("wit"))
        # Populate the text (and, for witDetail elements, the certainties map) recursively from the children:
        self.text = xml.text or ""
        self._parse_children(xml, verbose)
        # Strip any surrounding whitespace left over from spaces added between word elements:
        self.text = self.text.strip()
        # Use the xml:id if there is one; otherwise, use the n attribute if there is one; otherwise, use the text:
        reading_id = xml.get("{%s}id" % xml_ns)
        if reading_id is None:
            reading_id = xml.get("n")
        self.id = reading_id if reading_id is not None else self.text

    def parse(self, xml: et.Element, verbose: bool = False):
        """Given an XML element, recursively parses it and its subelements.

        A lem, rdg, or witDetail element (re)initializes this reading's type, witnesses, text, and ID
        (and, for witDetail, its targets and certainties). A certainty element updates the certainties map.
        The elements w, abbr, hi, space, ex, gap, supplied, unclear, choice, and ref append a serialization
        of themselves (including their tail text) to this reading's text.
        Any other element is ignored, along with its contents and its tail text.

        Args:
            xml: A lem, rdg, or witDetail element (or, in recursive calls, one of its descendants).
            verbose: An optional flag indicating whether or not to print status updates.
        """
        # Comments and processing instructions are exposed as nodes whose tag is not a string;
        # they have no content to serialize, but the text that follows them belongs to the parent:
        if not isinstance(xml.tag, str):
            self.text += xml.tail or ""
            return
        # Determine what this element is:
        raw_tag = xml.tag.replace("{%s}" % tei_ns, "")
        # If it is a reading or lemma, then copy its witnesses, and recursively process its children:
        if raw_tag in ("rdg", "lem"):
            self._parse_reading(xml, verbose, with_targets=False)
        # If it is a witness detail (e.g., an ambiguous reading), then also copy its target readings:
        elif raw_tag == "witDetail":
            self._parse_reading(xml, verbose, with_targets=True)
        # If it is a certainty measurement, then store its value in this reading's certainties map
        # (overwriting any previous values for its targets, since they shouldn't be specified more than once):
        elif raw_tag == "certainty":
            # If no degree is specified, then assume that all targets are equally likely
            # and assign them all a value of 1 (no normalization is performed by this class):
            degree = float(xml.get("degree")) if xml.get("degree") is not None else 1
            for t in self._split_refs(xml.get("target")):
                self.certainties[t] = degree
        # If it is a word, then serialize its text and tail, recursively processing any subelements,
        # and add a space after it:
        elif raw_tag == "w":
            self.text += xml.text or ""
            self._parse_children(xml, verbose)
            self.text += xml.tail or ""
            self.text += " "
        # If it is an abbreviation, then serialize its text and tail, recursively processing any subelements:
        elif raw_tag == "abbr":
            self.text += xml.text or ""
            self._parse_children(xml, verbose)
            self.text += xml.tail or ""
        # If it is a highlighted element, then serialize its text, adding an overline to each character
        # if it is overline-rendered:
        elif raw_tag == "hi":
            # NOTE: child elements of hi are not serialized, and only rend="overline" changes the output;
            # other rendering types could be supported here.
            content = xml.text or ""
            if xml.get("rend") == "overline":
                content = "".join(c + "\u0305" for c in content)
            self.text += content
            self.text += xml.tail or ""
        # If it is a space, then serialize it as a bracketed placeholder based on its attributes:
        elif raw_tag == "space":
            self.text += self._serialize_placeholder(xml, "space", "space")
            self.text += xml.tail or ""
        # If it is an expansion, then serialize it in parentheses:
        elif raw_tag == "ex":
            # NOTE: child elements of ex are not serialized.
            self.text += "(" + (xml.text or "") + ")"
            self.text += xml.tail or ""
        # If it is a gap, then serialize it as a bracketed placeholder based on its attributes
        # (using "..." if no unit and extent are specified):
        elif raw_tag == "gap":
            self.text += self._serialize_placeholder(xml, "gap", "...")
            self.text += xml.tail or ""
        # If it is a supplied element, then recursively set the contents in brackets:
        elif raw_tag == "supplied":
            self.text += "["
            self.text += xml.text or ""
            self._parse_children(xml, verbose)
            self.text += "]"
            self.text += xml.tail or ""
        # If it is an unclear element, then add an underdot to each character in its contents:
        elif raw_tag == "unclear":
            # Keep track of how long the text currently is, so we can modify just the portion we're about to add:
            starting_ind = len(self.text)
            self.text += xml.text or ""
            self._parse_children(xml, verbose)
            # Strip any surrounding spaces (in case there were entire words whose presence is unclear):
            unclear_text = self.text[starting_ind:].strip()
            # Add a dot under each character other than spaces:
            dotted_text = "".join(c if c == " " else c + "\u0323" for c in unclear_text)
            self.text = self.text[:starting_ind] + dotted_text
            self.text += xml.tail or ""
        # If it is a choice element, then recursively set the contents in brackets, separated by slashes:
        elif raw_tag == "choice":
            self.text += "["
            self.text += xml.text or ""
            has_alternatives = False
            for child in xml:
                # Comments and processing instructions are not alternatives, so they get no separator:
                if not isinstance(child.tag, str):
                    continue
                self.parse(child, verbose)
                # Only trim on the right: trimming on the left could shorten the text in front of this element
                # and invalidate the offsets that enclosing elements (e.g., unclear) are holding on to.
                self.text = self.text.rstrip() + "/"  # add a slash between each possibility
                has_alternatives = True
            if has_alternatives:
                self.text = self.text[:-1]  # remove only the last slash we added
            self.text += "]"
            self.text += xml.tail or ""
        # If it is a ref element, then set its target (stripped of "#" characters) in parentheses:
        elif raw_tag == "ref":
            target = xml.get("target")
            self.text += "(" + (target.strip("#") if target is not None else "") + ")"
            self.text += xml.tail or ""
        # Any other element contributes nothing to this reading.
        return