Coverage for teiphy/main.py: 100.00%

1from typing import List # for list-like inputs

2from importlib.metadata import version # for checking package version

3from pathlib import Path # for validating file address inputs

4from lxml import etree as et # for parsing XML input

5import typer

7from .format import Format

8from .collation import Collation, ClockModel, AncestralLogger, TableType

# The Typer application object; commands are registered on it with the
# @app.command() decorator. Help strings are rendered with Rich markup.
app = typer.Typer(
    rich_markup_mode="rich",
)

def version_callback(value: bool):
    """Print the installed teiphy version and stop, if the flag was given.

    Args:
        value: The parsed value of the flag this callback is attached to.
            When falsy, the callback does nothing and returns ``None``.

    Raises:
        typer.Exit: After echoing the version, to end processing.
    """
    # Guard clause: nothing to do unless the flag was actually set.
    if not value:
        return
    typer.echo(version("teiphy"))
    raise typer.Exit()

# Command: convert a TEI XML collation file to another format.
#
# Steps performed by the body:
#   1. Check that ``input`` has an .xml extension and parse it with lxml
#      (XML comments are stripped by the parser).
#   2. Check that ``fragmentary_threshold``, if given, lies in [0, 1].
#   3. Check that ``dates_file``, if given, has a .csv extension.
#   4. Build a Collation from the parsed XML and the witness/reading options,
#      then delegate the actual writing to Collation.to_file().
#
# Every validation failure prints a message to standard output and raises
# SystemExit(1).
#
# NOTE: this function deliberately has no docstring. Typer uses a command's
# docstring as its --help description, so adding one would change the CLI
# output; the documentation is kept in comments instead.
#
# NOTE: the parameter names ``version``, ``input`` and ``format`` shadow the
# imported ``version`` function and two builtins inside this function. They
# cannot be renamed because the command-line option/argument names are derived
# from them; none of the shadowed names is needed in the body.
@app.command()
def to_file(
    trivial_reading_types: List[str] = typer.Option(
        [],
        "-t",
        help="Reading types to treat as trivial and collapse with the previous substantive reading (e.g., reconstructed, defective, orthographic, subreading). If more than one type is applicable, this argument can be specified multiple times.",
    ),
    missing_reading_types: List[str] = typer.Option(
        [],
        "-m",
        help="Reading types to treat as missing data (e.g., lac, overlap). If more than one type is applicable, this argument can be specified multiple times.",
    ),
    suffixes: List[str] = typer.Option(
        [],
        "-s",
        help="Suffixes to ignore for manuscript witness sigla. Typically, these will be things like the sigla for first hands (*) and main texts (T), although you may also wish to use it to combine multiple attestations (often signified by /1, /2 in lectionaries) in the same witness. If more than one suffix is used, this argument can be specified multiple times.",
    ),
    fill_correctors: bool = typer.Option(
        False,
        help="Fill in missing readings in witnesses with type \"corrector\" using the witnesses they follow in the TEI XML witness list.",
    ),
    labels: bool = typer.Option(
        True,
        help="Print the CharStateLabels block (containing variation unit labels and reading texts converted to ASCII) in NEXUS output.",
    ),
    frequency: bool = typer.Option(
        False,
        help="Use the StatesFormat=Frequency setting instead of the StatesFormat=StatesPresent setting (and thus represent all states with frequency vectors rather than symbols) in NEXUS output.",
    ),
    fragmentary_threshold: float = typer.Option(
        None,
        help="Ignore all witnesses that are extant at fewer than the specified proportion of variation units. For the purposes of this calculation, a witness is considered non-extant/lacunose at a variation unit if the type of its reading in that unit is in the user-specified list of missing reading types (i.e., the argument(s) of the -m option). This calculation is performed after the reading sequences of correctors have been filled in (if the --fill-correctors flag was specified). Thus, a threshold of 0.7 means that a witness with missing readings at more than 30 percent of variation units will be excluded from the output.",
    ),
    drop_constant: bool = typer.Option(
        False,
        help="If set, do not write constant sites (i.e., variation units with one substantive reading) to output.",
    ),
    ambiguous_as_missing: bool = typer.Option(
        False,
        help="Use the missing symbol instead of multistate symbols (and thus treat all ambiguities as missing data) in NEXUS output; this option is only applied if the --frequency option is not set.",
    ),
    proportion: bool = typer.Option(
        False,
        help="If set, populate the output distance matrix's cells with proportions of disagreements over variation units where both witnesses are extant; this option is only used if --table distance is specified.",
    ),
    calibrate_dates: bool = typer.Option(
        False,
        help="Add an Assumptions block containing age distributions for witnesses to NEXUS output; this option is intended for NEXUS inputs to BEAST 2.",
    ),
    mrbayes: bool = typer.Option(
        False,
        help="Add a MrBayes block containing model settings and age calibrations for witnesses to NEXUS output; this option is intended for inputs to MrBayes.",
    ),
    clock: ClockModel = typer.Option(
        ClockModel.strict,
        help="The clock model to use; this option is intended for inputs to MrBayes and BEAST 2. MrBayes does not presently support a local clock model, so it will default to a strict clock model if a local clock model is specified.",
    ),
    ancestral_logger: AncestralLogger = typer.Option(
        AncestralLogger.state,
        help="The type of logger to use for ancestral state reconstruction data; this option is intended for inputs to BEAST 2. If \"state\", then only the reconstructed states at the root of each sampled tree will be logged. If \"sequence\", then each sampled tree's reconstructed states for all ancestors will be logged (WARNING: this will be memory-intensive!). If \"none\", then no ancestral states will be logged.",
    ),
    table: TableType = typer.Option(
        TableType.matrix,
        help="The type of table to use for CSV/Excel output. If \"matrix\", then the table will have rows for witnesses and columns for all variant readings, with frequency values in cells (the --split-missing flag can be used with this option). If \"distance\", then the table will have rows and columns for witnesses, with the number or proportion of disagreements between each pair in the corresponding cell (the --proportion flag can be used with this option). If \"similarity\", then the table will have rows and columns for witnesses, with the number or proportion of agreements between each pair in the corresponding cell (the --proportion flag can be used with this option). If \"nexus\", then the table will have rows for witnesses and columns for variation units with reading IDs in cells (the --ambiguous-as-missing flag can be used with this option). If \"long\", then the table will consist of repeated rows with column entries for taxa, characters, reading indices, and reading texts.",
    ),
    split_missing: bool = typer.Option(
        False,
        help="Treat missing characters/variation units as having a contribution of 1 split over all states/readings; if False, then missing data is ignored (i.e., all states are 0). Not applicable for non-tabular formats.",
    ),
    show_ext: bool = typer.Option(
        False,
        help="If set, each cell in a distance or similarity matrix will display the count/proportion of disagreements/agreements, followed by the number of variation units where both witnesses are extant and have unambiguous readings. (For example, a cell containing 47/50 in a similarity table would indicate that the row and column witnesses agree at 47 of the 50 units where they both have readings.) This option is only valid for tabular output formats of type \"distance\" or \"similarity\".",
    ),
    seed: int = typer.Option(
        None,
        help="Seed for random number generation (used for setting default initial values of transcriptional rate parameters for BEAST 2 XML output); if not specified, then the default seeding of the numpy.random.default_rng class will be used.",
    ),
    verbose: bool = typer.Option(False, help="Enable verbose logging (mostly for debugging purposes)."),
    version: bool = typer.Option(
        False,
        callback=version_callback,
        is_eager=True,
        help="Print the current version.",
    ),
    format: Format = typer.Option(None, case_sensitive=False, help="The output format."),
    dates_file: Path = typer.Option(
        None,
        exists=True,
        file_okay=True,
        dir_okay=False,
        writable=False,
        readable=True,
        resolve_path=True,
        help="CSV file containing witness IDs in the first column and minimum and maximum dates for those witnesses in the next two columns. If specified, then for all witnesses in the first column, any existing date ranges for them in the TEI XML collation will be ignored.",
    ),
    input: Path = typer.Argument(
        ...,
        exists=True,
        file_okay=True,
        dir_okay=False,
        writable=False,
        readable=True,
        resolve_path=True,
        help="Input TEI XML collation file to convert.",
    ),
    output: Path = typer.Argument(
        ...,
        exists=False,
        file_okay=True,
        dir_okay=False,
        writable=True,
        readable=False,
        resolve_path=True,
        help="Output for converted collation. If --format is not specified, then the format will be derived from the extension of this file.",
    ),
):
    # Make sure the input is an XML file:
    if input.suffix.lower() != ".xml":
        print("Error opening input file: The input file is not an XML file. Make sure the input file type is .xml.")
        # Raise SystemExit directly instead of calling the site-provided
        # exit() helper, which is intended for the interactive shell and is
        # not defined when the site module is not loaded.
        raise SystemExit(1)
    # If it is, then try to parse it:
    try:
        parser = et.XMLParser(remove_comments=True)
        xml = et.parse(input, parser=parser)
    except Exception as err:
        # Broad catch is deliberate at this CLI boundary: any failure to read
        # or parse the file is reported to the user rather than shown as a
        # traceback.
        print(f"Error opening input file: {err}")
        raise SystemExit(1)
    # Make sure the fragmentary_threshold input, if specified, is between 0 and 1.
    # The chained comparison (rather than "< 0.0 or > 1.0") also rejects NaN,
    # which compares False against both bounds and would otherwise slip through.
    if fragmentary_threshold is not None and not (0.0 <= fragmentary_threshold <= 1.0):
        print(
            f"Error: the fragmentary variation unit proportion threshold is {fragmentary_threshold:f}. "
            "It must be a value in [0, 1]."
        )
        raise SystemExit(1)
    # Make sure the dates_file input, if specified, is a CSV file:
    if dates_file is not None and dates_file.suffix.lower() != ".csv":
        print("Error opening dates file: The dates file is not a CSV file. Make sure the dates file type is .csv.")
        raise SystemExit(1)
    # Build the collation from the parsed XML and the witness/reading options.
    coll = Collation(
        xml,
        suffixes,
        trivial_reading_types,
        missing_reading_types,
        fill_correctors,
        fragmentary_threshold,
        dates_file,
        verbose,
    )
    # Delegate serialization; all output-shaping options are passed by keyword.
    coll.to_file(
        output,
        format=format,
        drop_constant=drop_constant,
        char_state_labels=labels,
        frequency=frequency,
        ambiguous_as_missing=ambiguous_as_missing,
        proportion=proportion,
        calibrate_dates=calibrate_dates,
        mrbayes=mrbayes,
        clock_model=clock,
        ancestral_logger=ancestral_logger,
        table_type=table,
        split_missing=split_missing,
        show_ext=show_ext,
        seed=seed,
    )