Coverage for teiphy/variation_unit.py: 100.00%

114 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2025-01-15 16:06 +0000

1#!/usr/bin/env python3 

2 

3from lxml import etree as et # for reading TEI XML inputs 

4 

5from .common import xml_ns, tei_ns 

6from .reading import Reading 

7 

8 

class VariationUnit:
    """Base class for storing TEI XML variation unit data internally.

    This corresponds to an app element in the collation.

    Attributes:
        id: The ID string of this variation unit, which should be unique.
        readings: A list of Readings contained in this VariationUnit.
        intrinsic_relations: A dictionary mapping pairs of IDs of Readings in this VariationUnit to the intrinsic odds category
            describing the two readings' relative probability of being authorial.
        transcriptional_relations_by_date_range: A dictionary mapping each pair of consecutive threshold dates
            (with None standing in for the open-ended period before the first date and after the last date)
            to a dictionary mapping pairs of IDs of Readings in this VariationUnit to a set of transcriptional change categories
            that could explain the rise of the second reading from the first during that period.
    """

    def __init__(self, xml: et.Element, verbose: bool = False):
        """Constructs a new VariationUnit instance from the TEI XML input.

        Args:
            xml: An lxml.etree.Element representing an app element.
            verbose: An optional boolean flag indicating whether or not to print status updates.
        """
        # Use its xml:id if it has one; otherwise, use its n, from, and to attributes if it has them:
        self.id = ""
        xml_id = xml.get("{%s}id" % xml_ns)
        if xml_id is not None:
            self.id = xml_id
        elif xml.get("n") is not None:
            self.id = xml.get("n")
            # The from and to attributes only qualify the n attribute, and only when both are present:
            if xml.get("from") is not None and xml.get("to") is not None:
                self.id += "_" + xml.get("from") + "_" + xml.get("to")
        # Initialize its list of readings:
        self.readings = []
        # Initialize its dictionaries of intrinsic and transcriptional relations:
        self.intrinsic_relations = {}
        self.transcriptional_relations_by_date_range = {}
        # Now parse the app element to populate these data structures:
        self.parse(xml, verbose)
        if verbose:
            print("New VariationUnit %s with %d readings" % (self.id, len(self.readings)))

    def __str__(self):
        """Returns the ID of this variation unit."""
        return self.id

    def __repr__(self):
        """Returns the same string as __str__."""
        return str(self)

    def parse(self, xml: et.Element, verbose: bool = False):
        """Given an XML element, recursively parses its subelements for readings, reading groups, witness details, and relations.

        The elements handled are app, rdgGrp, lem, rdg, witDetail, note (whose children are parsed recursively),
        and listRelation (of type "intrinsic" or "transcriptional").
        Any other element (e.g., noteGrp or wit) is ignored, as is any node whose tag is not a string
        (e.g., an XML comment or processing instruction).

        Args:
            xml: An lxml.etree.Element representing an app element or one of its descendants.
            verbose: An optional boolean flag indicating whether or not to print status updates.
        """
        # Determine what this element is (None means a comment, processing instruction, or similar non-element node):
        raw_tag = self._raw_tag(xml)
        if raw_tag is None:
            return
        # If it is an apparatus, then initialize the readings list and process the child elements of the apparatus recursively:
        if raw_tag == "app":
            self.readings = []
            for child in xml:
                self.parse(child, verbose)
        # If it is a reading group, then flatten it:
        elif raw_tag == "rdgGrp":
            self._parse_reading_group(xml, verbose)
        # If it is a lemma, then add it as a reading if has its own witness list (even if that list is empty, which may occur for a conjecture);
        # otherwise, assume its reading is duplicated in a <rdg> element and skip it:
        elif raw_tag == "lem":
            if xml.get("wit") is not None:
                self.readings.append(Reading(xml, verbose))
        # If it is a reading, then add it as a reading:
        elif raw_tag == "rdg":
            self.readings.append(Reading(xml, verbose))
        # If it is a witness detail, then add it as a reading (with a default target if it has none):
        elif raw_tag == "witDetail":
            self._add_witness_detail(xml, verbose)
        # If it is a note, then process the child elements of the note recursively:
        elif raw_tag == "note":
            for child in xml:
                self.parse(child, verbose)
        # If it is a list of relations, then populate the corresponding dictionary:
        elif raw_tag == "listRelation":
            relation_type = xml.get("type")
            if relation_type == "intrinsic":
                self._parse_intrinsic_relations(xml)
            elif relation_type == "transcriptional":
                self._parse_transcriptional_relations(xml)

    @staticmethod
    def _raw_tag(element):
        """Returns the element's tag with the TEI namespace prefix removed, or None if its tag is not a string."""
        tag = element.tag
        # Comments and processing instructions carry a factory function rather than a string as their tag:
        if not isinstance(tag, str):
            return None
        return tag.replace("{%s}" % tei_ns, "")

    def _parse_reading_group(self, xml: et.Element, verbose: bool = False) -> None:
        """Flattens a rdgGrp element by adding its lem and rdg children as readings and parsing its other children recursively."""
        # Get the type of this reading group (None if it has no type attribute):
        reading_group_type = xml.get("type")
        for child in xml:
            # Any <lem> element in a <rdgGrp> can be assumed not to be a duplicate of a <rdg> element,
            # as there should be only one <lem> at all levels under an <app> element:
            if self._raw_tag(child) in ("lem", "rdg"):
                rdg = Reading(child, verbose)
                # NOTE(review): the group's type (which may be None) replaces the reading's type only when the reading
                # already has a non-None type; confirm against Reading's default type that this condition is intended.
                if rdg.type is not None:
                    rdg.type = reading_group_type
                self.readings.append(rdg)
            else:
                self.parse(child, verbose)

    def _add_witness_detail(self, xml: et.Element, verbose: bool = False) -> None:
        """Adds a witDetail element as a reading, defaulting its target to the nearest preceding reading that has no targets."""
        wit_detail = Reading(xml, verbose)
        # If it does not have any targeted readings, then add a target for the previous rdg element (per the TEI Guidelines §12.1.4.1):
        if not wit_detail.targets:
            # Walk backwards past readings that have targets of their own; if every preceding reading has targets
            # (or there are no preceding readings), then leave this witness detail without a target rather than fail:
            for previous_rdg in reversed(self.readings):
                if not previous_rdg.targets:
                    wit_detail.targets.append(previous_rdg.id)
                    wit_detail.certainties[previous_rdg.id] = 1
                    break
        self.readings.append(wit_detail)

    def _parse_intrinsic_relations(self, xml: et.Element) -> None:
        """Replaces the intrinsic relations dictionary with the contents of a listRelation element of type "intrinsic"."""
        self.intrinsic_relations = {}
        for child in xml:
            active = child.get("active")
            passive = child.get("passive")
            ana = child.get("ana")
            # Skip any child that does not fully specify a relation:
            if active is None or passive is None or ana is None:
                continue
            from_readings = active.replace("#", "").split()
            to_readings = passive.replace("#", "").split()
            intrinsic_categories = ana.replace("#", "").split()
            # A relation with an empty ana attribute specifies no category, so there is nothing to record for it:
            if not intrinsic_categories:
                continue
            intrinsic_category = intrinsic_categories[0]  # there shouldn't be more than one of these
            # For each pair of readings, assign them the specified category:
            for from_reading in from_readings:
                for to_reading in to_readings:
                    self.intrinsic_relations[(from_reading, to_reading)] = intrinsic_category

    def _parse_transcriptional_relations(self, xml: et.Element) -> None:
        """Replaces the transcriptional relations dictionary with the contents of a listRelation element of type "transcriptional".

        Raises:
            ValueError: If a notBefore or notAfter attribute of a relation cannot be converted to an integer.
        """
        self.transcriptional_relations_by_date_range = {}
        # Only children that fully specify a relation are considered:
        relations = [
            child
            for child in xml
            if child.get("active") is not None and child.get("passive") is not None and child.get("ana") is not None
        ]
        # In a first pass, gather any dates specified for transcriptional relations;
        # they are collected as integers so that differently written forms of the same date yield a single threshold:
        unique_dates = set()
        for child in relations:
            for date_attribute in ("notBefore", "notAfter"):
                date_string = child.get(date_attribute)
                if date_string is not None:
                    unique_dates.add(int(date_string))
        # Then sort them and add null entries corresponding to periods before and after the first and last specified dates, respectively:
        threshold_dates = [None] + sorted(unique_dates) + [None]
        # Then initialize the output dictionary to map each pair of consecutive dates
        # to a dictionary of the transcriptional relations that hold between them:
        date_ranges = list(zip(threshold_dates, threshold_dates[1:]))
        for date_range in date_ranges:
            self.transcriptional_relations_by_date_range[date_range] = {}
        # Then, in a second pass, populate a map from (active, passive) reading tuples to their transcriptional categories
        # for each pair of consecutive dates:
        for child in relations:
            from_readings = child.get("active").replace("#", "").split()
            to_readings = child.get("passive").replace("#", "").split()
            transcriptional_categories = child.get("ana").replace("#", "").split()
            # By default, a relation holds over every period; notBefore and notAfter narrow the span of periods it covers:
            start_index = 0
            end_index = len(threshold_dates) - 1
            if child.get("notBefore") is not None:
                start_index = threshold_dates.index(int(child.get("notBefore")))
            if child.get("notAfter") is not None:
                end_index = threshold_dates.index(int(child.get("notAfter")))
            for date_range in date_ranges[start_index:end_index]:
                relations_in_range = self.transcriptional_relations_by_date_range[date_range]
                # For each pair of readings, assign them to the specified category or categories:
                for from_reading in from_readings:
                    for to_reading in to_readings:
                        # We only need distinct categories for each transition, so they are kept in a set:
                        relations_in_range.setdefault((from_reading, to_reading), set()).update(
                            transcriptional_categories
                        )