Coverage for teiphy/variation_unit.py: 100.00%

119 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-05-11 10:54 +0000

1#!/usr/bin/env python3 

2 

3from lxml import etree as et # for reading TEI XML inputs 

4 

5from .common import xml_ns, tei_ns 

6from .reading import Reading 

7 

8 

class VariationUnit:
    """Base class for storing TEI XML variation unit data internally.

    This corresponds to an app element in the collation.

    Attributes:
        id: The ID string of this variation unit, which should be unique.
        analysis_categories: A list of the category names found in the app element's ana attribute,
            with surrounding "#" characters stripped.
        readings: A list of Readings contained in this VariationUnit.
        intrinsic_relations: A dictionary mapping pairs of IDs of Readings in this VariationUnit to the intrinsic odds category
            describing the two readings' relative probability of being authorial.
        transcriptional_relations_by_date_range: A dictionary mapping (start, end) pairs of threshold dates
            (integers, or None for an open-ended bound) to a dictionary that maps pairs of IDs of Readings
            in this VariationUnit to the set of transcriptional change categories
            that could explain the rise of the second reading from the first within that date range.
    """

    def __init__(self, xml: et.Element, verbose: bool = False):
        """Constructs a new VariationUnit instance from the TEI XML input.

        Args:
            xml: An lxml.etree.Element representing an app element.
            verbose: An optional boolean flag indicating whether or not to print status updates.
        """
        # Use its xml:id if it has one; otherwise, use its n, from, and to attributes if it has them:
        self.id = ""
        xml_id = xml.get("{%s}id" % xml_ns)
        n = xml.get("n")
        if xml_id is not None:
            self.id = xml_id
        elif n is not None:
            self.id = n
            from_index = xml.get("from")
            to_index = xml.get("to")
            # The index suffix is only added when both ends of the range are given:
            if from_index is not None and to_index is not None:
                self.id += "_" + from_index + "_" + to_index
        # Initialize its list of analysis categories (for weighting changes in this unit in the stemma program),
        # stripping any "#" characters surrounding each pointer:
        ana = xml.get("ana")
        self.analysis_categories = [] if ana is None else [pointer.strip("#") for pointer in ana.split()]
        # Initialize its list of readings:
        self.readings = []
        # Initialize its dictionaries of intrinsic and transcriptional relations:
        self.intrinsic_relations = {}
        self.transcriptional_relations_by_date_range = {}
        # Now parse the app element to populate these data structures:
        self.parse(xml, verbose)
        if verbose:
            print("New VariationUnit %s with %d readings" % (self.id, len(self.readings)))

    def __str__(self):
        """Returns the ID of this variation unit."""
        return self.id

    def __repr__(self):
        """Returns the same string as __str__."""
        return str(self)

    def parse(self, xml: et.Element, verbose: bool = False) -> None:
        """Given an XML element, recursively parses its subelements for readings, reading groups, witness details, and relations.

        The children of app, rdgGrp, and note elements are processed recursively.
        Elements with any other unrecognized tag (and non-element nodes such as comments) are ignored.

        Args:
            xml: An lxml.etree.Element representing an app element or one of its descendants.
            verbose: An optional boolean flag indicating whether or not to print status updates.

        Raises:
            ValueError: If a transcriptional relation has a notBefore or notAfter value that is not an integer.
        """
        # Determine what this element is:
        raw_tag = self._local_tag(xml)
        if raw_tag is None:
            return
        # If it is an apparatus, then initialize the readings list and process the child elements of the apparatus recursively:
        if raw_tag == "app":
            self.readings = []
            for child in xml:
                self.parse(child, verbose)
        # If it is a reading group, then flatten it by processing its children recursively,
        # applying the reading group's type to the readings contained in it:
        elif raw_tag == "rdgGrp":
            reading_group_type = xml.get("type")
            for child in xml:
                # Any <lem> element in a <rdgGrp> can be assumed not to be a duplicate of a <rdg> element,
                # as there should be only one <lem> at all levels under an <app> element:
                if self._local_tag(child) in ("lem", "rdg"):
                    rdg = Reading(child, verbose)
                    # NOTE(review): the group's type replaces the reading's own type whenever the latter
                    # is not None, even if the group has no type attribute (which yields None);
                    # confirm this against the defaults set by Reading.
                    if rdg.type is not None:
                        rdg.type = reading_group_type
                    self.readings.append(rdg)
                else:
                    self.parse(child, verbose)
        # If it is a lemma, then add it as a reading if has its own witness list
        # (even if that list is empty, which may occur for a conjecture);
        # otherwise, assume its reading is duplicated in a <rdg> element and skip it:
        elif raw_tag == "lem":
            if xml.get("wit") is not None:
                self.readings.append(Reading(xml, verbose))
        # If it is a reading, then add it as a reading:
        elif raw_tag == "rdg":
            self.readings.append(Reading(xml, verbose))
        # If it is a witness detail, then add it as a reading, and if it does not have any targeted readings,
        # then add a target for the previous rdg element (per the TEI Guidelines §12.1.4.1):
        elif raw_tag == "witDetail":
            wit_detail = Reading(xml, verbose)
            if len(wit_detail.targets) == 0:
                previous_rdg = self._nearest_untargeted_reading(self.readings)
                # If every earlier reading has targets of its own, there is nothing to point to,
                # so the witness detail is kept without an implicit target:
                if previous_rdg is not None:
                    wit_detail.targets.append(previous_rdg.id)
                    wit_detail.certainties[previous_rdg.id] = 1
            self.readings.append(wit_detail)
        # If it is a note, then process the child elements of the note recursively:
        elif raw_tag == "note":
            for child in xml:
                self.parse(child, verbose)
        # If it is a list of relations, then populate the corresponding dictionary:
        elif raw_tag == "listRelation":
            relation_type = xml.get("type")
            if relation_type == "intrinsic":
                self.intrinsic_relations = self._collect_intrinsic_relations(xml)
            elif relation_type == "transcriptional":
                self.transcriptional_relations_by_date_range = self._collect_transcriptional_relations(xml)

    @staticmethod
    def _local_tag(element):
        """Returns the element's tag with the TEI namespace prefix removed, or None if the tag is not a string."""
        tag = element.tag
        # lxml yields comments and processing instructions as children whose tag is a callable, not a string:
        if not isinstance(tag, str):
            return None
        return tag.replace("{%s}" % tei_ns, "")

    @staticmethod
    def _nearest_untargeted_reading(readings):
        """Returns the last entry of the given list whose targets collection is empty, or None if there is none."""
        for reading in reversed(readings):
            if len(reading.targets) == 0:
                return reading
        return None

    @staticmethod
    def _collect_intrinsic_relations(xml) -> dict:
        """Returns a dictionary mapping (active, passive) reading ID pairs to the intrinsic category of their relation.

        Children lacking an active, passive, or ana attribute, or whose ana attribute names no category, are skipped.
        """
        intrinsic_relations = {}
        for child in xml:
            active = child.get("active")
            passive = child.get("passive")
            ana = child.get("ana")
            if active is None or passive is None or ana is None:
                continue
            categories = ana.replace("#", "").split()
            if not categories:
                continue  # a blank ana attribute names no category to assign
            intrinsic_category = categories[0]  # there shouldn't be more than one of these
            # For each pair of readings, assign them the specified category:
            for from_reading in active.replace("#", "").split():
                for to_reading in passive.replace("#", "").split():
                    intrinsic_relations[(from_reading, to_reading)] = intrinsic_category
        return intrinsic_relations

    @staticmethod
    def _collect_transcriptional_relations(xml) -> dict:
        """Returns a dictionary mapping consecutive threshold date pairs to the transcriptional relations holding between them.

        Each value is itself a dictionary mapping (active, passive) reading ID pairs to a set of category names.
        Children lacking an active, passive, or ana attribute are skipped.

        Raises:
            ValueError: If a notBefore or notAfter value is not an integer.
        """
        relations = [
            child
            for child in xml
            if child.get("active") is not None and child.get("passive") is not None and child.get("ana") is not None
        ]
        # In a first pass, gather any dates specified for transcriptional relations;
        # they are compared as integers so that equal dates written differently produce a single threshold:
        unique_dates = set()
        for relation in relations:
            for attribute in ("notBefore", "notAfter"):
                date_string = relation.get(attribute)
                if date_string is not None:
                    unique_dates.add(int(date_string))
        # Then add null entries corresponding to periods before and after the first and last specified dates, respectively:
        threshold_dates = [None] + sorted(unique_dates) + [None]
        # Then initialize the output dictionary to map each pair of consecutive dates
        # to a dictionary of the transcriptional relations that hold between them:
        date_ranges = list(zip(threshold_dates, threshold_dates[1:]))
        relations_by_date_range = {date_range: {} for date_range in date_ranges}
        # Then, in a second pass, populate a map from (active, passive) reading tuples
        # to their transcriptional categories for each consecutive pair of dates:
        for relation in relations:
            from_readings = relation.get("active").replace("#", "").split()
            to_readings = relation.get("passive").replace("#", "").split()
            transcriptional_categories = relation.get("ana").replace("#", "").split()
            # A relation without date bounds applies to every date range:
            start = 0
            stop = len(date_ranges)
            not_before = relation.get("notBefore")
            if not_before is not None:
                start = threshold_dates.index(int(not_before))
            not_after = relation.get("notAfter")
            if not_after is not None:
                stop = threshold_dates.index(int(not_after))
            for date_range in date_ranges[start:stop]:
                relations_in_range = relations_by_date_range[date_range]
                # For each pair of readings, assign them to the specified category or categories
                # (we only need distinct categories for each transition):
                for from_reading in from_readings:
                    for to_reading in to_readings:
                        pair = (from_reading, to_reading)
                        relations_in_range.setdefault(pair, set()).update(transcriptional_categories)
        return relations_by_date_range