Coverage for teiphy/reading.py: 100.00%

1#!/usr/bin/env python3

3from lxml import etree as et

5from .common import xml_ns, tei_ns

class Reading:
    """Base class for storing TEI XML reading data internally.

    This can correspond to a lem, rdg, or witDetail element in the collation.

    Attributes:
        id: The ID string of this reading, which should be unique within its parent app element.
        type: A string representing the type of reading. Examples include "reconstructed", "defective",
            "orthographic", "subreading", "ambiguous", "overlap", and "lac". The default value is "substantive".
        text: Serialization of the contents of this element.
        wits: A list of sigla referring to witnesses that support this reading.
        targets: A list of other reading ID strings to which this reading corresponds. For substantive readings,
            this should be empty. For ambiguous readings, it should contain references to the readings that might
            correspond to this one. For overlap readings, it should contain a reference to the reading from the
            overlapping variation unit responsible for the overlap.
        certainties: A dictionary mapping target reading IDs to floating-point certainty values.
    """

    # Maps a tag (with the TEI namespace removed) to the name of the method that serializes it.
    # Elements whose tag is not listed here are ignored by parse(): neither their text, their children,
    # nor their tail is added to the reading's text.
    _HANDLERS = {
        "rdg": "_parse_reading",
        "lem": "_parse_reading",
        "witDetail": "_parse_wit_detail",
        "certainty": "_parse_certainty",
        "w": "_parse_word",
        "abbr": "_parse_abbr",
        "hi": "_parse_hi",
        "space": "_parse_space",
        "ex": "_parse_ex",
        "gap": "_parse_gap",
        "supplied": "_parse_supplied",
        "unclear": "_parse_unclear",
        "choice": "_parse_choice",
        "ref": "_parse_ref",
    }

    def __init__(self, xml: et.Element, verbose: bool = False):
        """Constructs a new Reading instance from the TEI XML input.

        Args:
            xml: A lem, rdg, or witDetail element.
            verbose: An optional flag indicating whether or not to print status updates.
        """
        self.type = ""
        self.text = ""
        self.id = ""
        self.targets = []
        self.certainties = {}
        self.wits = []
        self.parse(xml, verbose)
        if verbose:
            # Describe the witnesses and the text separately, then report them in a single message:
            wits_summary = ("witnesses %s" % ", ".join(self.wits)) if len(self.wits) > 0 else "no witnesses"
            text_summary = ("text %s" % self.text) if self.text != "" else "no text"
            print("New Reading %s with type %s, %s, and %s" % (self.id, self.type, wits_summary, text_summary))

    def parse(self, xml: et.Element, verbose: bool = False):
        """Given an XML element, recursively parses it and its subelements.

        The element's tag (minus the TEI namespace) selects the handler that updates this reading.
        Elements with an unsupported tag leave the reading unchanged.

        Args:
            xml: A lem, rdg, or witDetail element, or (during recursion) one of its descendants.
            verbose: An optional flag indicating whether or not to print status updates.
        """
        # Comments and processing instructions are returned when iterating over an element's children,
        # but their tag is a callable rather than a string. They contribute no content of their own,
        # but the text following them (their tail) belongs to the enclosing element, so keep it:
        if not isinstance(xml.tag, str):
            self._append(xml.tail)
            return
        # Determine what this element is:
        raw_tag = xml.tag.replace("{%s}" % tei_ns, "")
        handler_name = self._HANDLERS.get(raw_tag)
        if handler_name is not None:
            getattr(self, handler_name)(xml, verbose)

    @staticmethod
    def _split_refs(value):
        """Splits a space-separated attribute value into a list, stripping "#" characters from the ends of each entry."""
        if value is None:
            return []
        return [ref.strip("#") for ref in value.split()]

    @staticmethod
    def _describe_extent(xml, label, fallback):
        """Returns the bracketed description shared by space and gap elements.

        If both the unit and extent attributes are present, the description is "<extent> <unit> <label>";
        otherwise it is the fallback string. A reason attribute, if present, is appended in parentheses.
        """
        unit = xml.get("unit")
        extent = xml.get("extent")
        reason = xml.get("reason")
        description = extent + " " + unit + " " + label if unit is not None and extent is not None else fallback
        if reason is not None:
            description += " (" + reason + ")"
        return "[" + description + "]"

    def _append(self, content):
        """Adds the given string to this reading's text, treating None as the empty string."""
        if content is not None:
            self.text += content

    def _parse_children(self, xml, verbose):
        """Recursively parses every child of the given element, in document order."""
        for child in xml:
            self.parse(child, verbose)

    def _finalize_text_and_id(self, xml):
        """Strips the accumulated text and assigns this reading's ID from the given top-level element."""
        # Strip any surrounding whitespace left over from spaces added between word elements:
        self.text = self.text.strip()
        # Use the xml:id if there is one; otherwise, use the n attribute if there is one; otherwise, use the text:
        xml_id = xml.get("{%s}id" % xml_ns)
        if xml_id is not None:
            self.id = xml_id
        elif xml.get("n") is not None:
            self.id = xml.get("n")
        else:
            self.id = self.text

    def _parse_reading(self, xml, verbose):
        """Handles a rdg or lem element: copies its type and witnesses, then serializes its contents."""
        # If it has a type, then save that; otherwise, default to "substantive":
        self.type = xml.get("type", "substantive")
        self.wits = self._split_refs(xml.get("wit"))
        # Populate its text recursively using its children:
        self.text = xml.text if xml.text is not None else ""
        self._parse_children(xml, verbose)
        self._finalize_text_and_id(xml)

    def _parse_wit_detail(self, xml, verbose):
        """Handles a witDetail element (e.g., an ambiguous reading): copies its type, targets, and witnesses,
        then serializes its contents and collects the certainties of its targets."""
        # If it has a type, then save that; otherwise, default to "substantive":
        self.type = xml.get("type", "substantive")
        self.targets = self._split_refs(xml.get("target"))
        self.wits = self._split_refs(xml.get("wit"))
        # Every target starts with a certainty of 0; any certainty children will overwrite these entries:
        self.certainties = dict.fromkeys(self.targets, 0)
        self.text = xml.text if xml.text is not None else ""
        self._parse_children(xml, verbose)
        self._finalize_text_and_id(xml)

    def _parse_certainty(self, xml, verbose):
        """Handles a certainty element: stores its degree for each of its target readings.

        Any previous values for the same targets are overwritten, since they shouldn't be specified more than once.
        """
        # If no degree is specified, then assume that all targets are equally likely and assign them all a value of 1
        # (no normalization is done in this class):
        degree = float(xml.get("degree")) if xml.get("degree") is not None else 1
        for target in self._split_refs(xml.get("target")):
            self.certainties[target] = degree

    def _parse_word(self, xml, verbose):
        """Handles a w element: serializes its text, children, and tail, and adds a space after it."""
        self._append(xml.text)
        self._parse_children(xml, verbose)
        self._append(xml.tail)
        self.text += " "

    def _parse_abbr(self, xml, verbose):
        """Handles an abbr element: serializes its text, children, and tail."""
        self._append(xml.text)
        self._parse_children(xml, verbose)
        self._append(xml.tail)

    def _parse_hi(self, xml, verbose):
        """Handles a hi element: adds an overline to each character of its text if it is rendered as overline."""
        # NOTE(review): child elements of hi are not processed here, so any text inside them is dropped;
        # this matches the existing behavior (the recursion was commented out) -- confirm it is intended.
        content = xml.text if xml.text is not None else ""
        # NOTE: other rendering types could be supported here
        if xml.get("rend") == "overline":
            content = "".join([c + "\u0305" for c in content])
        self.text += content
        self._append(xml.tail)

    def _parse_space(self, xml, verbose):
        """Handles a space element: serializes it as a bracketed description such as "[2 chars space]" or "[space]"."""
        self.text += self._describe_extent(xml, "space", "space")
        self._append(xml.tail)

    def _parse_ex(self, xml, verbose):
        """Handles an ex (expansion) element: serializes its text in parentheses."""
        # NOTE(review): as with hi, child elements of ex are not processed here -- confirm it is intended.
        self.text += "("
        self._append(xml.text)
        self.text += ")"
        self._append(xml.tail)

    def _parse_gap(self, xml, verbose):
        """Handles a gap element: serializes it as a bracketed description such as "[3 chars gap]" or "[...]"."""
        # "..." is the placeholder text for a gap if no unit and extent are specified:
        self.text += self._describe_extent(xml, "gap", "...")
        self._append(xml.tail)

    def _parse_supplied(self, xml, verbose):
        """Handles a supplied element: recursively serializes its contents in brackets."""
        self.text += "["
        self._append(xml.text)
        self._parse_children(xml, verbose)
        self.text += "]"
        self._append(xml.tail)

    def _parse_unclear(self, xml, verbose):
        """Handles an unclear element: adds an underdot to each character (other than spaces) in its contents."""
        # Keep track of how long the text currently is, so we can modify just the portion we're about to add:
        starting_ind = len(self.text)
        self._append(xml.text)
        self._parse_children(xml, verbose)
        # Strip any surrounding spaces (in case there were entire words whose presence is unclear):
        old_text = self.text[starting_ind:].strip()
        new_text = "".join([c if c == " " else c + "\u0323" for c in old_text])
        self.text = self.text[:starting_ind] + new_text
        self._append(xml.tail)

    def _parse_choice(self, xml, verbose):
        """Handles a choice element: recursively serializes its alternatives in brackets, separated by slashes."""
        self.text += "["
        self._append(xml.text)
        has_alternative = False
        for child in xml:
            # Comments and processing instructions are not alternatives, so they get no separator:
            if not isinstance(child.tag, str):
                continue
            self.parse(child, verbose)
            # Only trailing whitespace is trimmed: trimming the front of the accumulated text would
            # invalidate the offset that an enclosing unclear element remembers.
            self.text = self.text.rstrip() + "/"  # add a slash between each possibility
            has_alternative = True
        if has_alternative:
            # Remove exactly the one slash we added last, leaving any slashes from the content itself intact:
            self.text = self.text[:-1]
        self.text += "]"
        self._append(xml.tail)

    def _parse_ref(self, xml, verbose):
        """Handles a ref element: serializes its target (stripped of "#" characters at the ends) in parentheses."""
        target = xml.get("target")
        self.text += "("
        self.text += target.strip("#") if target is not None else ""
        self.text += ")"
        self._append(xml.tail)