# Source code for preprocessing_pipeline.download

import subprocess
from pathlib import Path
import logging

logger = logging.getLogger(__name__)

def download_file(url: str, out_path: Path) -> None:
    """
    Download a file from a given URL and save it locally using the `wget` command.

    The transfer is written to a temporary sibling file (``<name>.part``) and
    only renamed to `out_path` after `wget` exits successfully. A failed or
    interrupted download therefore never leaves a truncated file at
    `out_path` that a later call would mistake for a finished download.

    Parameters
    ----------
    url : str
        The URL to download from.
    out_path : pathlib.Path
        The local file path where the downloaded file will be saved.

    Returns
    -------
    None
        If the file already exists, no download is performed. Otherwise, runs
        `wget` to fetch the file.

    Raises
    ------
    subprocess.CalledProcessError
        If `wget` exits with a non-zero status.
    FileNotFoundError
        If the `wget` executable cannot be found.
    """
    if out_path.exists():
        logger.info(f"File already exists: {out_path}. Skipping download.")
        return

    partial_path = out_path.with_name(out_path.name + ".part")
    logger.info(f"Downloading {url} -> {out_path}")
    try:
        # Argument list, no shell: URLs routinely contain '&', '?', ';' and
        # paths may contain spaces, all of which a shell would reinterpret.
        subprocess.run(["wget", "-O", str(partial_path), url], check=True)
    except BaseException:
        # wget creates the -O target even when the transfer fails; remove the
        # leftover before propagating the error (including Ctrl-C).
        partial_path.unlink(missing_ok=True)
        raise
    partial_path.replace(out_path)
def unzip_gz(file_path: Path, remove_input: bool = False) -> None:
    """
    Decompress a GZIP file (i.e., `.gz`) in-place using `gzip -d`.

    Parameters
    ----------
    file_path : pathlib.Path
        Path to the `.gz` file to be decompressed.
    remove_input : bool, optional
        If True, delete the original `.gz` file after successful decompression.
        If False (the default), the `.gz` file is kept next to the
        decompressed output.

    Returns
    -------
    None
        Decompresses the file in-place, leaving a new file without the `.gz` extension.

    Raises
    ------
    subprocess.CalledProcessError
        If `gzip` exits with a non-zero status.
    FileNotFoundError
        If the `gzip` executable cannot be found.
    """
    cmd = ["gzip", "-d"]
    if not remove_input:
        # Plain `gzip -d` always deletes its input on success; -k (keep) is
        # required to honour remove_input=False.
        cmd.append("-k")
    cmd.append(str(file_path))

    logger.info(f"Decompressing {file_path}")
    # Argument list, no shell: paths with spaces or shell metacharacters are
    # passed to gzip verbatim.
    subprocess.run(cmd, check=True)

    if remove_input:
        # gzip normally removed the archive already; this covers
        # implementations that leave it behind.
        file_path.unlink(missing_ok=True)
def resolve_genome_urls(
    species: str,
    assembly: str,
    gtf_url: str,
    chrom_sizes_url: str,
    fasta_url: str
) -> tuple[str, str, str]:
    """
    Pick the GTF, chromosome-sizes and FASTA URLs to use for a species/assembly pair.

    Every URL the caller supplies is used as given. For the recognized
    combinations (mouse/mm10 and human/hg38, matched case-insensitively) any
    URL passed as None is replaced by a built-in default. For any other
    combination all three URLs must be supplied.

    Parameters
    ----------
    species : str
        Species name (e.g. "mouse", "human").
    assembly : str
        Genome assembly (e.g. "mm10", "hg38").
    gtf_url : str or None
        A user-provided URL for the GTF file, or None to try defaults.
    chrom_sizes_url : str or None
        A user-provided URL for the chromosome sizes file, or None to try defaults.
    fasta_url : str or None
        A user-provided URL for the FASTA file, or None to try defaults.

    Returns
    -------
    (final_gtf_url, final_chrom_sizes_url, final_fasta_url) : tuple of str
        The resolved URLs, in that order.

    Raises
    ------
    ValueError
        If the species/assembly pair has no built-in defaults and at least one
        URL is None.
    """
    # species -> assembly -> (gtf, chrom sizes, fasta)
    builtin_defaults = {
        "mouse": {
            "mm10": (
                "https://ftp.ebi.ac.uk/pub/databases/gencode/"
                "Gencode_mouse/release_M18/gencode.vM18.basic.annotation.gtf.gz",
                "https://hgdownload.cse.ucsc.edu/goldenpath/mm10/bigZips/mm10.chrom.sizes",
                "https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.fa.gz",
            ),
        },
        "human": {
            "hg38": (
                "https://ftp.ebi.ac.uk/pub/databases/gencode/"
                "Gencode_human/release_47/gencode.v47.primary_assembly.basic.annotation.gtf.gz",
                "https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes",
                "https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.fa.gz",
            ),
        },
    }

    supplied = (gtf_url, chrom_sizes_url, fasta_url)

    per_assembly = builtin_defaults.get(species.lower())
    fallbacks = None
    if per_assembly is not None:
        fallbacks = per_assembly.get(assembly.lower())

    if fallbacks is not None:
        # Recognized combo: a default only fills a slot the caller left as None.
        return tuple(
            fallback if given is None else given
            for given, fallback in zip(supplied, fallbacks)
        )

    # Unrecognized combo: nothing to fall back on, so every URL is mandatory.
    if any(given is None for given in supplied):
        raise ValueError(
            f"Unknown assembly '{assembly}' for species='{species}'. "
            "Please provide gtf_url, chrom_sizes_url, and fasta_url in config."
        )
    return supplied
def download_genome_references(
    genome_dir: Path,
    species: str,
    assembly: str,
    gtf_url: str = None,
    chrom_sizes_url: str = None,
    fasta_url: str = None
) -> None:
    """
    Fetch the GTF, chrom.sizes and FASTA reference files for a species/assembly.

    Steps, in order:

    1. Create `genome_dir` if needed and resolve the three URLs via
       `resolve_genome_urls` (user overrides first, built-in defaults otherwise).
    2. GTF: unless ``annotation.gtf`` already exists, download to
       ``annotation.gtf.gz`` and decompress it.
    3. Chromosome sizes: unless ``<assembly>.chrom.sizes`` already exists,
       download it as-is.
    4. FASTA: unless ``<assembly>.fa`` already exists, download to
       ``<assembly>.fa.gz`` and decompress it.

    Parameters
    ----------
    genome_dir : pathlib.Path
        Directory where the reference files will be downloaded.
    species : str
        Species name (e.g. "mouse", "human").
    assembly : str
        Genome assembly version (e.g. "mm10", "hg38").
    gtf_url : str, optional
        GTF file URL to override defaults. If None, use known default (if available).
    chrom_sizes_url : str, optional
        Chromosome sizes file URL to override defaults. If None, use known default (if available).
    fasta_url : str, optional
        FASTA file URL to override defaults. If None, use known default (if available).

    Returns
    -------
    None
        Files are downloaded into `genome_dir`. If a file already exists, no new download occurs.
    """
    genome_dir.mkdir(parents=True, exist_ok=True)

    resolved_gtf, resolved_sizes, resolved_fasta = resolve_genome_urls(
        species, assembly, gtf_url, chrom_sizes_url, fasta_url
    )
    logger.info(
        f"Using genome references for species='{species}', assembly='{assembly}'.\n"
        f"GTF: {resolved_gtf}\n"
        f"Chrom.sizes: {resolved_sizes}\n"
        f"FASTA: {resolved_fasta}"
    )

    def fetch_compressed(url, archive_path, unpacked_path):
        # Download and decompress only when the unpacked file is not there yet.
        if unpacked_path.exists():
            return
        download_file(url, archive_path)
        unzip_gz(archive_path, remove_input=True)

    # GTF annotation (stored under a fixed, assembly-independent name)
    fetch_compressed(
        resolved_gtf,
        genome_dir / "annotation.gtf.gz",
        genome_dir / "annotation.gtf",
    )

    # Chromosome sizes (plain text, no decompression step)
    sizes_path = genome_dir / f"{assembly}.chrom.sizes"
    if not sizes_path.exists():
        download_file(resolved_sizes, sizes_path)

    # Genome FASTA
    fetch_compressed(
        resolved_fasta,
        genome_dir / f"{assembly}.fa.gz",
        genome_dir / f"{assembly}.fa",
    )

    logger.info(f"Reference files are ready in {genome_dir}")