Coverage for teiphy/main.py: 100.00%
33 statements
« prev ^ index » next coverage.py v7.9.2, created at 2026-04-20 19:45 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2026-04-20 19:45 +0000
1from typing import List # for list-like inputs
2from importlib.metadata import version # for checking package version
3from pathlib import Path # for validating file address inputs
4from lxml import etree as et # for parsing XML input
5import typer
7from .format import Format
8from .collation import Collation, ClockModel, AncestralLogger, TableType, SplitMissingType, TransformMatrixType
# Typer application object for the command-line interface; the to_file command
# defined in this module is registered on it through the @app.command() decorator.
# NOTE(review): rich_markup_mode="rich" presumably lets help strings use Rich
# markup -- confirm against the Typer documentation.
app = typer.Typer(rich_markup_mode="rich")
def version_callback(value: bool):
    """Eager callback for the ``--version`` option.

    When ``value`` is falsy nothing happens and ``None`` is returned. When it is
    truthy, the installed version of the "teiphy" distribution is echoed and
    ``typer.Exit`` is raised so that no further command processing takes place.
    """
    # Guard clause: the flag was not supplied, so let normal processing continue.
    if not value:
        return
    typer.echo(version("teiphy"))
    raise typer.Exit()
@app.command()
def to_file(
    trivial_reading_types: List[str] = typer.Option(
        [],
        "-t",
        help="Reading types to treat as trivial and collapse with the previous substantive reading (e.g., reconstructed, defective, orthographic, subreading). If more than one type is applicable, this argument can be specified multiple times.",
    ),
    missing_reading_types: List[str] = typer.Option(
        [],
        "-m",
        help="Reading types to treat as missing data (e.g., lac, overlap). If more than one type is applicable, this argument can be specified multiple times.",
    ),
    suffixes: List[str] = typer.Option(
        [],
        "-s",
        help="Suffixes to ignore for manuscript witness sigla. Typically, these will be things like the sigla for first hands (*) and main texts (T), although you may also wish to use it to combine multiple attestations (often signified by /1, /2 in lectionaries) in the same witness. If more than one suffix is used, this argument can be specified multiple times.",
    ),
    fill_correctors: bool = typer.Option(
        False,
        help="Fill in missing readings in witnesses with type \"corrector\" using the witnesses they follow in the TEI XML witness list.",
    ),
    labels: bool = typer.Option(
        True,
        help="Print the CharStateLabels block (containing variation unit labels and reading texts converted to ASCII) in NEXUS output.",
    ),
    frequency: bool = typer.Option(
        False,
        help="Use the StatesFormat=Frequency setting instead of the StatesFormat=StatesPresent setting (and thus represent all states with frequency vectors rather than symbols) in NEXUS output.",
    ),
    fragmentary_threshold: float = typer.Option(
        None,
        help="Ignore all witnesses that are extant at fewer than the specified proportion of variation units. For the purposes of this calculation, a witness is considered non-extant/lacunose at a variation unit if the type of its reading in that unit is in the user-specified list of missing reading types (i.e., the argument(s) of the -m option). This calculation is performed after the reading sequences of correctors have been filled in (if the --fill-correctors flag was specified). Thus, a threshold of 0.7 means that a witness with missing readings at more than 30 percent of variation units will be excluded from the output.",
    ),
    drop_constant: bool = typer.Option(
        False,
        help="If set, do not write constant sites (i.e., variation units with one substantive reading) to output.",
    ),
    ambiguous_as_missing: bool = typer.Option(
        False,
        help="Use the missing symbol instead of multistate symbols (and thus treat all ambiguities as missing data) in NEXUS output; this option is only applied if the --frequency option is not set.",
    ),
    proportion: bool = typer.Option(
        False,
        help="If set, divide the output matrix's cells by the number of variation units where the row and column witnesses are both extant; this option is only used for witness-to-witness matrix outputs (e.g., tabular outputs of type \"distance\", \"similarity\", \"idf\", \"mi\", etc.).\nFor distance and similarity matrices, this option will transform disagreement or agreement counts to values between 0 and 1.\nFor IDF and MI matrices, this option will return the mean IDF or MI for each pair of witnesses over all variation units where both are extant.",
    ),
    calibrate_dates: bool = typer.Option(
        False,
        help="Add an Assumptions block containing age distributions for witnesses to NEXUS output; this option is intended for NEXUS inputs to BEAST 2.",
    ),
    mrbayes: bool = typer.Option(
        False,
        help="Add a MrBayes block containing model settings and age calibrations for witnesses to NEXUS output; this option is intended for inputs to MrBayes.",
    ),
    clock: ClockModel = typer.Option(
        ClockModel.strict,
        help="The clock model to use; this option is intended for inputs to MrBayes and BEAST 2. MrBayes does not presently support a local clock model, so it will default to a strict clock model if a local clock model is specified.",
    ),
    ancestral_logger: AncestralLogger = typer.Option(
        AncestralLogger.state,
        help="The type of logger to use for ancestral state reconstruction data; this option is intended for inputs to BEAST 2. If \"state\", then only the reconstructed states at the root of each sampled tree will be logged. If \"sequence\", then each sampled tree's reconstructed states for all ancestors will be logged (WARNING: this will be memory-intensive!). If \"none\", then no ancestral states will be logged.",
    ),
    table: TableType = typer.Option(
        TableType.matrix,
        help="The type of table to use for CSV/TSV/Excel/PHYLIP output.\nIf \"matrix\", then the table will have rows for witnesses and columns for all variant readings, with frequency values in cells (the --split-missing flag can be used with this option).\nIf \"distance\", then the table will have rows and columns for witnesses, with the number or proportion of disagreements between each pair in the corresponding cell (the --proportion flag can be used with this option).\nIf \"similarity\", then the table will have rows and columns for witnesses, with the number or proportion of agreements between each pair in the corresponding cell (the --proportion flag can be used with this option).\nIf \"idf\", then the table will have rows and columns for witnesses, where each cell contains the sum or mean of inverse document frequency-weighted agreements between the corresponding pair of witnesses (the --proportion flag can be used with this option).\nIf \"mi\", then the table will have rows and columns for witnesses, where each cell contains the sum or mean of mutual information between the corresponding pair of witnesses over all variation units (the --proportion flag can be used with this option).\nIf \"nexus\", then the table will have rows for witnesses and columns for variation units with reading IDs in cells (the --ambiguous-as-missing flag can be used with this option).\nIf \"long\", then the table will consist of repeated rows with column entries for taxa, characters, reading indices, and reading texts.\nIf the output is a PHYLIP file, then the type of tabular output must be \"distance\" or \"similarity\"; otherwise, it will be ignored.",
    ),
    split_missing: SplitMissingType = typer.Option(
        None,
        help="Treat missing characters/variation units as having a contribution of 1 split over all states/readings.\nIf not specified, then missing data is ignored (i.e., all states are 0).\nIf \"uniform\", then the contribution of 1 is divided evenly over all substantive readings.\nIf \"proportional\", then the contribution of 1 is divided between the readings in proportion to their support among the witnesses that are not missing.\nNot applicable for non-tabular formats.",
    ),
    transform_matrix: TransformMatrixType = typer.Option(
        None,
        help="Transform the columns of a witness-to-witness matrix output (e.g., a tabular output of type \"distance\", \"similarity\", \"idf\", \"mi\", etc.).\nIf \"stddev\", then the mean and standard deviation of each column are calculated and each value in that column is replaced with the number of standard deviations it is from the mean.\nIf \"mad\", then the median and median absolute deviation (MAD) of each column are calculated and each value in that column is replaced with the number of MADs it is from the median.\nThe transformation is applied after the --split-missing and --proportion options are applied.",
    ),
    show_ext: bool = typer.Option(
        False,
        help="If set, each cell of a witness-to-witness matrix output (e.g., a tabular output of type \"distance\", \"similarity\", \"idf\", \"mi\", etc.) will display the cell's value, followed by the number of variation units where both witnesses are extant and have unambiguous readings.\n(For example, a cell containing 47/50 in a similarity table would indicate that the row and column witnesses agree at 47 of the 50 units where they both have readings.)",
    ),
    seed: int = typer.Option(
        None,
        help="Seed for random number generation (used for setting default initial values of transcriptional rate parameters for BEAST 2 XML output); if not specified, then the default seeding of the numpy.random.default_rng class will be used.",
    ),
    verbose: bool = typer.Option(False, help="Enable verbose logging (mostly for debugging purposes)."),
    version: bool = typer.Option(
        False,
        callback=version_callback,
        is_eager=True,
        help="Print the current version.",
    ),
    format: Format = typer.Option(None, case_sensitive=False, help="The output format."),
    dates_file: Path = typer.Option(
        None,
        exists=True,
        file_okay=True,
        dir_okay=False,
        writable=False,
        readable=True,
        resolve_path=True,
        help="CSV file containing witness IDs in the first column and minimum and maximum dates for those witnesses in the next two columns. If specified, then for all witnesses in the first column, any existing date ranges for them in the TEI XML collation will be ignored.",
    ),
    input: Path = typer.Argument(
        ...,
        exists=True,
        file_okay=True,
        dir_okay=False,
        writable=False,
        readable=True,
        resolve_path=True,
        help="Input TEI XML collation file to convert.",
    ),
    output: Path = typer.Argument(
        ...,
        exists=False,
        file_okay=True,
        dir_okay=False,
        writable=True,
        readable=False,
        resolve_path=True,
        help="Output for converted collation. If --format is not specified, then the format will be derived from the extension of this file.",
    ),
):
    """Convert a TEI XML collation file and write the result to the output file.

    The input is parsed with XML comments removed, loaded into a Collation, and
    then written out with the chosen format and output options.
    """
    # Every validation failure below prints a message to standard output and ends
    # the process with exit status 1. SystemExit is raised directly instead of
    # calling the exit() helper, because that helper is installed by the site
    # module and is missing under "python -S" and in frozen interpreters.

    # Make sure the input is an XML file:
    if input.suffix.lower() != ".xml":
        print("Error opening input file: The input file is not an XML file. Make sure the input file type is .xml.")
        raise SystemExit(1)

    # If it is, then try to parse it. Any parsing or I/O problem is reported to the
    # user as a one-line message rather than a traceback.
    try:
        parser = et.XMLParser(remove_comments=True)
        xml = et.parse(input, parser=parser)
    except Exception as err:
        print(f"Error opening input file: {err}")
        raise SystemExit(1)

    # Make sure the fragmentary_threshold input, if specified, is between 0 and 1.
    # The chained comparison is negated (instead of testing "< 0.0 or > 1.0") so
    # that NaN, for which every comparison is false, is rejected as well.
    if fragmentary_threshold is not None and not (0.0 <= fragmentary_threshold <= 1.0):
        print(
            f"Error: the fragmentary variation unit proportion threshold is {fragmentary_threshold:f}. "
            "It must be a value in [0, 1]."
        )
        raise SystemExit(1)

    # Make sure the dates_file input, if specified, is a CSV file:
    if dates_file is not None and dates_file.suffix.lower() != ".csv":
        print("Error opening dates file: The dates file is not a CSV file. Make sure the dates file type is .csv.")
        raise SystemExit(1)

    # Build the collation from the parsed XML and the input-related options.
    coll = Collation(
        xml,
        suffixes,
        trivial_reading_types,
        missing_reading_types,
        fill_correctors,
        fragmentary_threshold,
        dates_file,
        verbose,
    )
    # Hand the output-related options to the collation's writer.
    coll.to_file(
        output,
        format=format,
        drop_constant=drop_constant,
        char_state_labels=labels,
        frequency=frequency,
        ambiguous_as_missing=ambiguous_as_missing,
        proportion=proportion,
        calibrate_dates=calibrate_dates,
        mrbayes=mrbayes,
        clock_model=clock,
        ancestral_logger=ancestral_logger,
        table_type=table,
        split_missing=split_missing,
        transform_matrix=transform_matrix,
        show_ext=show_ext,
        seed=seed,
    )