Example Script
The file run_snpio.py is a ready-to-use template that walks through a
complete SNPio workflow:
Load genotype data (VCF, Structure, Phylip, or Genepop + popmap)
Filter and clean the data
Run desired population-genetic analyses
Generate plots and an interactive MultiQC report
Run it using the snpio terminal command once SNPio and its dependencies are installed:
$ snpio \
--input data/example.vcf.gz \
--popmap data/popmap.txt \
--prefix results/example_run
Command-line options
run_snpio.py uses argparse for a clean CLI:
Option |
Description |
|---|---|
|
Path to a VCF / PHYLIP / STRUCTURE file. |
|
Two-column file mapping sample ID → population. |
|
Output prefix; results go to |
|
Plot format: |
|
Verbose logging. |
|
Extra diagnostics for troubleshooting. |
Outputs
The script produces:
Filtered genotype data (HDF5 + flat files)
Population-genetic statistics (CSV / JSON)
Plots (summary, PCA, distances, outliers, D-stats)
Interactive MultiQC report in
<prefix>_output/multiqc/
Source code
Below is the full example script bundled with SNPio. Feel free to copy, adapt, or mine individual steps for your own pipelines.
1#!/usr/bin/env python3
2
3import argparse
4from pathlib import Path
5
6from snpio import NRemover2, PopGenStatistics, SNPioMultiQC, VCFReader
7
8"""
9run_snpio.py
10
11A helper script to run SNPio programmatically from within Docker or CLI.
12
13Usage:
14 python run_snpio.py \
15 --input /app/data/0_original_alignments/example.vcf \
16 --popmap /app/data/1_popmaps/example_popmap.txt \
17 --prefix /app/results/snpio \
18 --verbose \
19 --debug \
20 --plot-format <png|pdf|svg>
21"""
22
23
24def version():
25 from snpio import __version__
26
27 return str(__version__)
28
29
30def validate_file(path: str, name: str) -> None:
31 pth = Path(path)
32 if not pth.exists() or not pth.is_file():
33 print(f"ERROR: {name} file not found at: {path}")
34 raise FileNotFoundError(f"{name} file not found: {path}")
35
36
37def parse_args():
38 parser = argparse.ArgumentParser(
39 prog="SNPio",
40 description="Run SNPio with specified input, popmap, and output prefix.",
41 )
42 parser.add_argument(
43 "--input",
44 type=str,
45 required=True,
46 help="Path to input file (VCF, PHYLIP, or STRUCTURE format).",
47 )
48 parser.add_argument(
49 "--popmap",
50 type=str,
51 required=True,
52 help="Path to popmap file mapping samples to populations. Format: <sample>\t<population>",
53 )
54 parser.add_argument(
55 "--prefix",
56 type=str,
57 required=True,
58 help="Output prefix for results (output files will be saved as <prefix>_output/*)",
59 )
60 parser.add_argument(
61 "--verbose",
62 action="store_true",
63 help="Enable verbose logging. Includes additional logging information during processing.",
64 )
65 parser.add_argument(
66 "--debug",
67 action="store_true",
68 help="Enable debug mode. Includes additional logging and checks. This may slow down processing.",
69 )
70 parser.add_argument(
71 "--plot-format",
72 type=str,
73 default="png",
74 choices=["png", "pdf", "svg"],
75 help="Format for output plots. Options: png, pdf, svg (default: png)",
76 )
77
78 parser.add_argument(
79 "--version",
80 default=False,
81 required=False,
82 action="store_true",
83 help="Show the version of SNPio and exit.",
84 )
85
86 args = parser.parse_args()
87
88 if args.version:
89 print(f"SNPio version {version()}")
90 exit(0)
91
92 return args
93
94
95def main():
96 args = parse_args()
97
98 # Validate paths
99 validate_file(args.input, "Input")
100 validate_file(args.popmap, "Popmap")
101
102 print(f"🧬 Running SNPio version {version()} with the following arguments:")
103 print(f" 📥 Input file: {args.input}")
104 print(f" 🧾 Popmap file: {args.popmap}")
105 print(f" 📁 Output prefix: {args.prefix}")
106 print(f" 🖼️ Plot format: {args.plot_format}")
107 print(f" 🔍 Verbose: {args.verbose}")
108 print(f" 🐛 Debug: {args.debug}")
109 print()
110
111 genotype_data = VCFReader(
112 filename=args.input,
113 popmapfile=args.popmap,
114 force_popmap=True,
115 chunk_size=5000,
116 include_pops=["EA", "GU", "TT", "ON", "OG"],
117 prefix=args.prefix,
118 plot_format=args.plot_format,
119 verbose=args.verbose,
120 debug=args.debug,
121 # allele_encoding={"0": "A", "1": "C", "2": "G", "3": "T", "-9": "N"},
122 )
123
124 # Generate missingness reports before filtering
125 genotype_data.missingness_reports(prefix=args.prefix)
126
127 nrm = NRemover2(genotype_data)
128
129 nrm.search_thresholds(
130 thresholds=[0.25, 0.5, 0.75],
131 maf_thresholds=[0.01, 0.05],
132 mac_thresholds=[2, 3],
133 filter_order=[
134 "filter_missing_sample",
135 "filter_missing",
136 "filter_missing_pop",
137 "filter_monomorphic",
138 "filter_singletons",
139 "filter_biallelic",
140 "filter_mac",
141 "filter_maf",
142 ],
143 )
144
145 gd_filt = (
146 nrm.filter_biallelic(exclude_heterozygous=True)
147 .filter_missing(0.75)
148 .filter_missing_pop(0.75)
149 .filter_singletons(exclude_heterozygous=True)
150 .filter_missing_sample(0.8)
151 .resolve()
152 )
153
154 nrm.plot_sankey_filtering_report()
155 gd_filt.missingness_reports(gd_filt.prefix)
156
157 pgs = PopGenStatistics(gd_filt, verbose=args.verbose, debug=args.debug)
158
159 allele_summary_stats, summary_stats = pgs.summary_statistics(
160 fst_method="observed", n_reps=1000, n_jobs=8
161 )
162 fst_dist = pgs.fst_distance(
163 method="permutation", n_reps=1000, n_jobs=8, palette="magma"
164 )
165 fst_dist = pgs.fst_distance(
166 method="bootstrap", n_reps=1000, n_jobs=8, palette="magma"
167 )
168
169 neis_dist_boot = pgs.neis_genetic_distance(
170 method="bootstrap", n_reps=1000, n_jobs=8
171 )
172
173 neis_dist_perm = pgs.neis_genetic_distance(
174 method="permutation", n_reps=1000, n_jobs=8
175 )
176
177 fst_perm = pgs.detect_fst_outliers(
178 n_permutations=100,
179 correction_method="fdr_bh",
180 use_dbscan=False,
181 n_jobs=8,
182 min_samples=5,
183 seed=42,
184 )
185
186 fst_dbscan = pgs.detect_fst_outliers(
187 n_permutations=1000,
188 correction_method="fdr_bh",
189 use_dbscan=True,
190 n_jobs=8,
191 min_samples=5,
192 seed=42,
193 )
194
195 dstats = pgs.calculate_d_statistics(
196 method="patterson",
197 population1="EA",
198 population2="GU",
199 population3="TT",
200 outgroup="ON",
201 num_bootstraps=1000,
202 individual_selection="random",
203 max_individuals_per_pop=3,
204 seed=42,
205 )
206
207 dstats_partitioned = pgs.calculate_d_statistics(
208 method="partitioned",
209 population1="EA",
210 population2="GU",
211 population3="TT",
212 population4="ON",
213 outgroup="OG",
214 num_bootstraps=1000,
215 individual_selection="random",
216 max_individuals_per_pop=3,
217 seed=42,
218 )
219
220 dstats_dfoil = pgs.calculate_d_statistics(
221 method="dfoil",
222 population1="EA",
223 population2="GU",
224 population3="TT",
225 population4="ON",
226 outgroup="OG",
227 num_bootstraps=1000,
228 individual_selection="random",
229 max_individuals_per_pop=3,
230 seed=42,
231 )
232
233 # Run PCA
234 pgs.pca()
235
236 # Build MultiQC report
237 print("📊 Building MultiQC report...")
238 SNPioMultiQC.build(
239 prefix="Example Report",
240 output_dir=f"{args.prefix}_output/multiqc",
241 overwrite=True,
242 )
243
244
245if __name__ == "__main__":
246 main()
Tip
For large analyses, adjust the chunk_size and n_jobs parameters to match your compute resources, and consider running the filtering and statistics steps in a workflow manager (e.g. Snakemake or Nextflow).