Coverage for teiphy/main.py: 100.00%

1from typing import List # for list-like inputs

2from importlib.metadata import version # for checking package version

3from pathlib import Path # for validating file address inputs

4from lxml import etree as et # for parsing XML input

5import typer

7from .format import Format

8from .collation import Collation, ClockModel, AncestralLogger, TableType, SplitMissingType, TransformMatrixType

# Typer application object that this module's CLI commands are registered on
# (via the @app.command() decorator); it is created with Rich markup mode
# enabled.
app = typer.Typer(rich_markup_mode="rich")

def version_callback(value: bool):
    """Print the installed teiphy version and stop when the flag is set.

    Used as the callback of the ``--version`` option of ``to_file``.

    Args:
        value: The parsed value of the flag; nothing happens when it is falsy.

    Raises:
        typer.Exit: After the version has been echoed, so that no further
            processing takes place.
    """
    # Nothing to do unless the flag was actually supplied.
    if not value:
        return
    # Look up the version of the "teiphy" distribution and print it.
    typer.echo(version("teiphy"))
    raise typer.Exit()

# Convert a TEI XML collation file to another format and write it to disk.
#
# The function validates its command-line inputs (input file extension, XML
# well-formedness, threshold ranges, dates file extension), builds a Collation
# from the parsed XML and delegates the actual conversion to Collation.to_file.
# On any validation failure it prints an error message and raises SystemExit
# with status 1.
#
# NOTE: this documentation is deliberately kept in comments rather than in a
# docstring, because Typer uses a command function's docstring as the help
# text of the command, which would change the --help output.
#
# NOTE: the parameter names are part of the command-line interface (Typer
# derives the argument/option names from them), so `input`, `format` and
# `version` are kept even though they shadow a builtin or an imported name.
@app.command()
def to_file(
    trivial_reading_types: List[str] = typer.Option(
        [],
        "-t",
        help="Reading types to treat as trivial and collapse with the previous substantive reading (e.g., reconstructed, defective, orthographic, subreading). If more than one type is applicable, this argument can be specified multiple times.",
    ),
    missing_reading_types: List[str] = typer.Option(
        [],
        "-m",
        help="Reading types to treat as missing data (e.g., lac, overlap). If more than one type is applicable, this argument can be specified multiple times.",
    ),
    suffixes: List[str] = typer.Option(
        [],
        "-s",
        help="Suffixes to ignore for manuscript witness sigla. Typically, these will be things like the sigla for first hands (*) and main texts (T), although you may also wish to use it to combine multiple attestations (often signified by /1, /2 in lectionaries) in the same witness. If more than one suffix is used, this argument can be specified multiple times.",
    ),
    fill_correctors: bool = typer.Option(
        False,
        help="Fill in missing readings in witnesses with type \"corrector\" using the witnesses they follow in the TEI XML witness list.",
    ),
    labels: bool = typer.Option(
        True,
        help="Print the CharStateLabels block (containing variation unit labels and reading texts converted to ASCII) in NEXUS output.",
    ),
    frequency: bool = typer.Option(
        False,
        help="Use the StatesFormat=Frequency setting instead of the StatesFormat=StatesPresent setting (and thus represent all states with frequency vectors rather than symbols) in NEXUS output.",
    ),
    fragmentary_threshold: float = typer.Option(
        None,
        help="Ignore all witnesses that are extant at fewer than the specified proportion of variation units. For the purposes of this calculation, a witness is considered non-extant/lacunose at a variation unit if the type of its reading in that unit is in the user-specified list of missing reading types (i.e., the argument(s) of the -m option). This calculation is performed after the reading sequences of correctors have been filled in (if the --fill-correctors flag was specified). Thus, a threshold of 0.7 means that a witness with missing readings at more than 30 percent of variation units will be excluded from the output.",
    ),
    # NOTE(review): the help text below says a corrector has an explicit
    # reading when its reading type "is in" the list of missing reading types;
    # this may be intended to read "is not in" -- confirm against Collation
    # before changing the wording.
    fill_correctors_threshold: float = typer.Option(
        None,
        help="Do not fill in any correctors that have explicit readings at fewer than the specified proportion of variation units. For the purposes of this calculation, a corrector has an explicit reading at a variation unit if the type of its reading in that unit is in the user-specified list of missing reading types (i.e., the argument(s) of the -m option). This option is only used if the --fill-correctors flag is specified.",
    ),
    drop_constant: bool = typer.Option(
        False,
        help="If set, do not write constant sites (i.e., variation units with one substantive reading) to output.",
    ),
    ambiguous_as_missing: bool = typer.Option(
        False,
        help="Use the missing symbol instead of multistate symbols (and thus treat all ambiguities as missing data) in NEXUS output; this option is only applied if the --frequency option is not set.",
    ),
    proportion: bool = typer.Option(
        False,
        help="If set, divide the output matrix's cells by the number of variation units where the row and column witnesses are both extant; this option is only used for witness-to-witness matrix outputs (e.g., tabular outputs of type \"distance\", \"similarity\", \"idf\", \"mi\", etc.).\nFor distance and similarity matrices, this option will transform disagreement or agreement counts to values between 0 and 1.\nFor IDF and MI matrices, this option will return the mean IDF or MI for each pair of witnesses over all variation units where both are extant.",
    ),
    calibrate_dates: bool = typer.Option(
        False,
        help="Add an Assumptions block containing age distributions for witnesses to NEXUS output; this option is intended for NEXUS inputs to BEAST 2.",
    ),
    mrbayes: bool = typer.Option(
        False,
        help="Add a MrBayes block containing model settings and age calibrations for witnesses to NEXUS output; this option is intended for inputs to MrBayes.",
    ),
    clock: ClockModel = typer.Option(
        ClockModel.strict,
        help="The clock model to use; this option is intended for inputs to MrBayes and BEAST 2. MrBayes does not presently support a local clock model, so it will default to a strict clock model if a local clock model is specified.",
    ),
    ancestral_logger: AncestralLogger = typer.Option(
        AncestralLogger.state,
        help="The type of logger to use for ancestral state reconstruction data; this option is intended for inputs to BEAST 2. If \"state\", then only the reconstructed states at the root of each sampled tree will be logged. If \"sequence\", then each sampled tree's reconstructed states for all ancestors will be logged (WARNING: this will be memory-intensive!). If \"none\", then no ancestral states will be logged.",
    ),
    table: TableType = typer.Option(
        TableType.matrix,
        help="The type of table to use for CSV/TSV/Excel/PHYLIP output.\nIf \"matrix\", then the table will have rows for witnesses and columns for all variant readings, with frequency values in cells (the --split-missing flag can be used with this option).\nIf \"distance\", then the table will have rows and columns for witnesses, with the number or proportion of disagreements between each pair in the corresponding cell (the --proportion flag can be used with this option).\nIf \"similarity\", then the table will have rows and columns for witnesses, with the number or proportion of agreements between each pair in the corresponding cell (the --proportion flag can be used with this option).\nIf \"idf\", then the table will have rows and columns for witnesses, where each cell contains the sum or mean of inverse document frequency-weighted agreements between the corresponding pair of witnesses (the --proportion flag can be used with this option).\nIf \"mi\", then the table will have rows and columns for witnesses, where each cell contains the sum or mean of mutual information between the corresponding pair of witnesses over all variation units (the --proportion flag can be used with this option).\nIf \"nexus\", then the table will have rows for witnesses and columns for variation units with reading IDs in cells (the --ambiguous-as-missing flag can be used with this option).\nIf \"long\", then the table will consist of repeated rows with column entries for taxa, characters, reading indices, and reading texts.\nIf the output is a PHYLIP file, then the type of tabular output must be \"distance\" or \"similarity\"; otherwise, it will be ignored.",
    ),
    split_missing: SplitMissingType = typer.Option(
        None,
        help="Treat missing characters/variation units as having a contribution of 1 split over all states/readings.\nIf not specified, then missing data is ignored (i.e., all states are 0).\nIf \"uniform\", then the contribution of 1 is divided evenly over all substantive readings.\nIf \"proportional\", then the contribution of 1 is divided between the readings in proportion to their support among the witnesses that are not missing.\nNot applicable for non-tabular formats.",
    ),
    transform_matrix: TransformMatrixType = typer.Option(
        None,
        help="Transform the columns of a witness-to-witness matrix output (e.g., a tabular output of type \"distance\", \"similarity\", \"idf\", \"mi\", etc.).\nIf \"stddev\", then the mean and standard deviation of each column are calculated and each value in that column is replaced with the number of standard deviations it is from the mean.\nIf \"mad\", then the median and median absolute deviation (MAD) of each column are calculated and each value in that column is replaced with the number of MADs it is from the median.\nThe transformation is applied after the --split-missing and --proportion options are applied.",
    ),
    show_ext: bool = typer.Option(
        False,
        help="If set, each cell of a witness-to-witness matrix output (e.g., a tabular output of type \"distance\", \"similarity\", \"idf\", \"mi\", etc.) will display the cell's value, followed by the number of variation units where both witnesses are extant and have unambiguous readings.\n(For example, a cell containing 47/50 in a similarity table would indicate that the row and column witnesses agree at 47 of the 50 units where they both have readings.)",
    ),
    seed: int = typer.Option(
        None,
        help="Seed for random number generation (used for setting default initial values of transcriptional rate parameters for BEAST 2 XML output); if not specified, then the default seeding of the numpy.random.default_rng class will be used.",
    ),
    verbose: bool = typer.Option(False, help="Enable verbose logging (mostly for debugging purposes)."),
    # Eager option: its callback prints the version and exits. Inside this
    # function the name shadows the imported `version` helper, which the body
    # does not call.
    version: bool = typer.Option(
        False,
        callback=version_callback,
        is_eager=True,
        help="Print the current version.",
    ),
    format: Format = typer.Option(None, case_sensitive=False, help="The output format."),
    dates_file: Path = typer.Option(
        None,
        exists=True,
        file_okay=True,
        dir_okay=False,
        writable=False,
        readable=True,
        resolve_path=True,
        help="CSV file containing witness IDs in the first column and minimum and maximum dates for those witnesses in the next two columns. If specified, then for all witnesses in the first column, any existing date ranges for them in the TEI XML collation will be ignored.",
    ),
    input: Path = typer.Argument(
        ...,
        exists=True,
        file_okay=True,
        dir_okay=False,
        writable=False,
        readable=True,
        resolve_path=True,
        help="Input TEI XML collation file to convert.",
    ),
    output: Path = typer.Argument(
        ...,
        exists=False,
        file_okay=True,
        dir_okay=False,
        writable=True,
        readable=False,
        resolve_path=True,
        help="Output for converted collation. If --format is not specified, then the format will be derived from the extension of this file.",
    ),
):
    # Error exits below raise SystemExit directly instead of calling exit():
    # exit() is a helper injected by the `site` module for interactive use and
    # is not guaranteed to exist (e.g., under `python -S`). The exception type
    # and exit status are the same as before.

    # Make sure the input is an XML file:
    if input.suffix.lower() != ".xml":
        print("Error opening input file: The input file is not an XML file. Make sure the input file type is .xml.")
        raise SystemExit(1)
    # If it is, then try to parse it (dropping XML comments while parsing):
    try:
        parser = et.XMLParser(remove_comments=True)
        xml = et.parse(input, parser=parser)
    except Exception as err:
        # Any failure to read or parse the file is reported to the user and
        # ends the command.
        print(f"Error opening input file: {err}")
        raise SystemExit(1)
    # Make sure the fragmentary_threshold input, if specified, is between 0 and 1
    # (the chained comparison also rejects NaN, which is not a value in [0, 1]):
    if fragmentary_threshold is not None and not 0.0 <= fragmentary_threshold <= 1.0:
        print(
            f"Error: the fragmentary variation unit proportion threshold is {fragmentary_threshold:f}. It must be a value in [0, 1]."
        )
        raise SystemExit(1)
    # Make sure the fill_correctors_threshold input, if specified, is between 0 and 1
    # (again rejecting NaN):
    if fill_correctors_threshold is not None and not 0.0 <= fill_correctors_threshold <= 1.0:
        print(
            f"Error: the variation unit proportion threshold for filling correctors is {fill_correctors_threshold:f}. It must be a value in [0, 1]."
        )
        raise SystemExit(1)
    # Make sure the dates_file input, if specified, is a CSV file:
    if dates_file is not None and dates_file.suffix.lower() != ".csv":
        print("Error opening dates file: The dates file is not a CSV file. Make sure the dates file type is .csv.")
        raise SystemExit(1)
    # Build the collation from the parsed XML and the preprocessing options
    # (passed positionally, in the order used by the original call):
    coll = Collation(
        xml,
        suffixes,
        trivial_reading_types,
        missing_reading_types,
        fill_correctors,
        fragmentary_threshold,
        fill_correctors_threshold,
        dates_file,
        verbose,
    )
    # Delegate the conversion and writing to the collation, forwarding the
    # output-related options by keyword:
    coll.to_file(
        output,
        format=format,
        drop_constant=drop_constant,
        char_state_labels=labels,
        frequency=frequency,
        ambiguous_as_missing=ambiguous_as_missing,
        proportion=proportion,
        calibrate_dates=calibrate_dates,
        mrbayes=mrbayes,
        clock_model=clock,
        ancestral_logger=ancestral_logger,
        table_type=table,
        split_missing=split_missing,
        transform_matrix=transform_matrix,
        show_ext=show_ext,
        seed=seed,
    )