Example Script

The file run_snpio.py is a ready-to-use template that walks through a complete SNPio workflow:

  • Load genotype data (VCF, Structure, Phylip, or Genepop + popmap)

  • Filter and clean the data

  • Run desired population-genetic analyses

  • Generate plots and an interactive MultiQC report

Run it using the snpio terminal command once SNPio and its dependencies are installed:

$ snpio \
   --input   data/example.vcf.gz \
   --popmap  data/popmap.txt \
   --prefix  results/example_run

Command-line options

run_snpio.py uses argparse for a clean CLI:

Option

Description

--input

Path to a VCF / PHYLIP / STRUCTURE file.

--popmap

Two-column file mapping sample ID → population.

--prefix

Output prefix; results go to <prefix>_output/.

--plot-format

Plot format: png | pdf | svg | jpeg (default: png).

--verbose

Verbose logging.

--debug

Extra diagnostics for troubleshooting.

Outputs

The script produces:

  • Filtered genotype data (HDF5 + flat files)

  • Population-genetic statistics (CSV / JSON)

  • Plots (summary, PCA, distances, outliers, D-stats)

  • Interactive MultiQC report in <prefix>_output/multiqc/

Source code

Below is the full example script bundled with SNPio. Feel free to copy, adapt, or mine individual steps for your own pipelines.

  1#!/usr/bin/env python3
  2
  3import argparse
  4from pathlib import Path
  5
  6from snpio import NRemover2, PopGenStatistics, SNPioMultiQC, VCFReader
  7
  8"""
  9run_snpio.py
 10
 11A helper script to run SNPio programmatically from within Docker or CLI.
 12
 13Usage:
 14   python run_snpio.py \
 15      --input /app/data/0_original_alignments/example.vcf \
 16      --popmap /app/data/1_popmaps/example_popmap.txt \
 17      --prefix /app/results/snpio \
 18      --verbose \
 19      --debug \
 20      --plot-format <png|pdf|svg>
 21"""
 22
 23
 24def version():
 25   from snpio import __version__
 26
 27   return str(__version__)
 28
 29
 30def validate_file(path: str, name: str) -> None:
 31   pth = Path(path)
 32   if not pth.exists() or not pth.is_file():
 33      print(f"ERROR: {name} file not found at: {path}")
 34      raise FileNotFoundError(f"{name} file not found: {path}")
 35
 36
 37def parse_args():
 38   parser = argparse.ArgumentParser(
 39      prog="SNPio",
 40      description="Run SNPio with specified input, popmap, and output prefix.",
 41   )
 42   parser.add_argument(
 43      "--input",
 44      type=str,
 45      required=True,
 46      help="Path to input file (VCF, PHYLIP, or STRUCTURE format).",
 47   )
 48   parser.add_argument(
 49      "--popmap",
 50      type=str,
 51      required=True,
 52      help="Path to popmap file mapping samples to populations. Format: <sample>\t<population>",
 53   )
 54   parser.add_argument(
 55      "--prefix",
 56      type=str,
 57      required=True,
 58      help="Output prefix for results (output files will be saved as <prefix>_output/*)",
 59   )
 60   parser.add_argument(
 61      "--verbose",
 62      action="store_true",
 63      help="Enable verbose logging. Includes additional logging information during processing.",
 64   )
 65   parser.add_argument(
 66      "--debug",
 67      action="store_true",
 68      help="Enable debug mode. Includes additional logging and checks. This may slow down processing.",
 69   )
 70   parser.add_argument(
 71      "--plot-format",
 72      type=str,
 73      default="png",
 74      choices=["png", "pdf", "svg"],
 75      help="Format for output plots. Options: png, pdf, svg (default: png)",
 76   )
 77
 78   parser.add_argument(
 79      "--version",
 80      default=False,
 81      required=False,
 82      action="store_true",
 83      help="Show the version of SNPio and exit.",
 84   )
 85
 86   args = parser.parse_args()
 87
 88   if args.version:
 89      print(f"SNPio version {version()}")
 90      exit(0)
 91
 92   return args
 93
 94
 95def main():
 96   args = parse_args()
 97
 98   # Validate paths
 99   validate_file(args.input, "Input")
100   validate_file(args.popmap, "Popmap")
101
102   print(f"🧬 Running SNPio version {version()} with the following arguments:")
103   print(f"  📥 Input file:     {args.input}")
104   print(f"  🧾 Popmap file:    {args.popmap}")
105   print(f"  📁 Output prefix:  {args.prefix}")
106   print(f"  🖼️ Plot format:     {args.plot_format}")
107   print(f"  🔍 Verbose:         {args.verbose}")
108   print(f"  🐛 Debug:           {args.debug}")
109   print()
110
111   genotype_data = VCFReader(
112      filename=args.input,
113      popmapfile=args.popmap,
114      force_popmap=True,
115      chunk_size=5000,
116      include_pops=["EA", "GU", "TT", "ON", "OG"],
117      prefix=args.prefix,
118      plot_format=args.plot_format,
119      verbose=args.verbose,
120      debug=args.debug,
121      # allele_encoding={"0": "A", "1": "C", "2": "G", "3": "T", "-9": "N"},
122   )
123
124   # Generate missingness reports before filtering
125   genotype_data.missingness_reports(prefix=args.prefix)
126
127   nrm = NRemover2(genotype_data)
128
129   nrm.search_thresholds(
130      thresholds=[0.25, 0.5, 0.75],
131      maf_thresholds=[0.01, 0.05],
132      mac_thresholds=[2, 3],
133      filter_order=[
134            "filter_missing_sample",
135            "filter_missing",
136            "filter_missing_pop",
137            "filter_monomorphic",
138            "filter_singletons",
139            "filter_biallelic",
140            "filter_mac",
141            "filter_maf",
142      ],
143   )
144
145   gd_filt = (
146      nrm.filter_biallelic(exclude_heterozygous=True)
147      .filter_missing(0.75)
148      .filter_missing_pop(0.75)
149      .filter_singletons(exclude_heterozygous=True)
150      .filter_missing_sample(0.8)
151      .resolve()
152   )
153
154   nrm.plot_sankey_filtering_report()
155   gd_filt.missingness_reports(gd_filt.prefix)
156
157   pgs = PopGenStatistics(gd_filt, verbose=args.verbose, debug=args.debug)
158
159   allele_summary_stats, summary_stats = pgs.summary_statistics(
160      fst_method="observed", n_reps=1000, n_jobs=8
161   )
162   fst_dist = pgs.fst_distance(
163      method="permutation", n_reps=1000, n_jobs=8, palette="magma"
164   )
165   fst_dist = pgs.fst_distance(
166      method="bootstrap", n_reps=1000, n_jobs=8, palette="magma"
167   )
168
169   neis_dist_boot = pgs.neis_genetic_distance(
170      method="bootstrap", n_reps=1000, n_jobs=8
171   )
172
173   neis_dist_perm = pgs.neis_genetic_distance(
174      method="permutation", n_reps=1000, n_jobs=8
175   )
176
177   fst_perm = pgs.detect_fst_outliers(
178      n_permutations=100,
179      correction_method="fdr_bh",
180      use_dbscan=False,
181      n_jobs=8,
182      min_samples=5,
183      seed=42,
184   )
185
186   fst_dbscan = pgs.detect_fst_outliers(
187      n_permutations=1000,
188      correction_method="fdr_bh",
189      use_dbscan=True,
190      n_jobs=8,
191      min_samples=5,
192      seed=42,
193   )
194
195   dstats = pgs.calculate_d_statistics(
196      method="patterson",
197      population1="EA",
198      population2="GU",
199      population3="TT",
200      outgroup="ON",
201      num_bootstraps=1000,
202      individual_selection="random",
203      max_individuals_per_pop=3,
204      seed=42,
205   )
206
207   dstats_partitioned = pgs.calculate_d_statistics(
208      method="partitioned",
209      population1="EA",
210      population2="GU",
211      population3="TT",
212      population4="ON",
213      outgroup="OG",
214      num_bootstraps=1000,
215      individual_selection="random",
216      max_individuals_per_pop=3,
217      seed=42,
218   )
219
220   dstats_dfoil = pgs.calculate_d_statistics(
221      method="dfoil",
222      population1="EA",
223      population2="GU",
224      population3="TT",
225      population4="ON",
226      outgroup="OG",
227      num_bootstraps=1000,
228      individual_selection="random",
229      max_individuals_per_pop=3,
230      seed=42,
231   )
232
233   # Run PCA
234   pgs.pca()
235
236   # Build MultiQC report
237   print("📊 Building MultiQC report...")
238   SNPioMultiQC.build(
239      prefix="Example Report",
240      output_dir=f"{args.prefix}_output/multiqc",
241      overwrite=True,
242   )
243
244
245if __name__ == "__main__":
246   main()

Tip

For large analyses, adjust the chunk_size and n_jobs parameters to match your compute resources, and consider running the filtering and statistics steps in a workflow manager (e.g. Snakemake or Nextflow).