#!/usr/bin/env python3

"""
BLAST Analysis Pipeline Script
Purpose: Automate BLASTP searches for multiple protein FASTA files
Author: George Kargas
Date: December 17, 2024

This script processes multiple protein FASTA (.faa) files through BLASTP,
comparing them against a specified BLAST database (like nr) to identify
species-specific genes.
"""

import os
import sys
import subprocess
import argparse
from datetime import datetime
import logging

def setup_logger():
    """
    Configure logging to both file and console output.
    
    Returns:
        logger: Configured logging object that writes to both blast_analysis.log
               and the console with timestamps and log levels.
    """
    logging.basicConfig(
        level=logging.INFO,  # Set logging level to INFO to capture all important messages
        format='%(asctime)s - %(levelname)s - %(message)s',  # Include timestamp and log level
        handlers=[
            logging.FileHandler('blast_analysis.log'),  # Save logs to file
            logging.StreamHandler(sys.stdout)  # Also print to console
        ]
    )
    return logging.getLogger(__name__)

def check_input_files(input_dir):
    """
    Verify that the input directory contains .faa files for processing.
    
    Args:
        input_dir (str): Path to directory containing input files
        
    Returns:
        list: List of .faa files found in the directory
        
    Raises:
        ValueError: If no .faa files are found in the directory
    """
    # List comprehension to find all .faa files in directory
    faa_files = [f for f in os.listdir(input_dir) if f.endswith('.faa')]
    
    # Raise error if no .faa files found
    if not faa_files:
        raise ValueError(f"No .faa files found in {input_dir}")
    return faa_files

def run_blastp(input_file, db_path, output_dir, logger):
    """
    Execute BLASTP on a single input file.
    
    Args:
        input_file (str): Path to input .faa file
        db_path (str): Path to BLAST database
        output_dir (str): Directory for output files
        logger: Logging object for status messages
    
    Returns:
        bool: True if BLASTP completes successfully, False otherwise
        
    Notes:
        - Uses BLAST+ output format 6 (tabular)
        - E-value cutoff: 1e-5
        - Runs with 8 threads for performance
    """
    try:
        # Construct output filename from input filename
        output_file = os.path.join(
            output_dir, 
            f"{os.path.splitext(os.path.basename(input_file))[0]}_blast.txt"
        )
        
        # Construct BLASTP command with parameters
        cmd = [
            'blastp',  # Use protein BLAST
            '-query', input_file,  # Input file
            '-db', db_path,  # BLAST database path
            '-out', output_file,  # Output file
            '-evalue', '1e-5',  # E-value threshold
            # Custom tabular output format with taxonomy information
            '-outfmt', '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sscinames',
            '-num_threads', '8'  # Use 8 CPU threads
        ]
        
        # Log start of BLAST process
        logger.info(f"Running BLASTP on {os.path.basename(input_file)}")
        
        # Execute BLASTP command
        subprocess.run(cmd, check=True)
        
        # Log successful completion
        logger.info(f"Successfully processed {os.path.basename(input_file)}")
        return True
        
    except subprocess.CalledProcessError as e:
        # Log any errors that occur during BLAST
        logger.error(f"Error running BLASTP on {input_file}: {str(e)}")
        return False

def main():
    """
    Main function to coordinate the BLAST analysis pipeline.
    
    Workflow:
    1. Parse command line arguments
    2. Set up logging
    3. Create output directory
    4. Process each input file
    5. Provide summary of results
    """
    # Set up command line argument parser
    parser = argparse.ArgumentParser(description='Run BLASTP analysis on protein sequences')
    parser.add_argument('--input', required=True, help='Directory containing .faa files')
    parser.add_argument('--db', required=True, help='Path to BLAST database')
    parser.add_argument('--output', required=True, help='Output directory for BLAST results')
    
    # Parse command line arguments
    args = parser.parse_args()
    
    # Initialize logging
    logger = setup_logger()
    logger.info("Starting BLAST analysis pipeline")
    
    try:
        # Create output directory if it doesn't exist
        os.makedirs(args.output, exist_ok=True)
        
        # Verify input files exist
        input_files = check_input_files(args.input)
        logger.info(f"Found {len(input_files)} .faa files to process")
        
        # Process each file and track success/failure
        successful = 0
        failed = 0
        
        # Iterate through each input file
        for faa_file in input_files:
            input_path = os.path.join(args.input, faa_file)
            if run_blastp(input_path, args.db, args.output, logger):
                successful += 1
            else:
                failed += 1
        
        # Log final summary
        logger.info("BLAST Analysis Complete!")
        logger.info(f"Successfully processed: {successful} files")
        if failed > 0:
            logger.warning(f"Failed to process: {failed} files")
            
    except Exception as e:
        # Log any unexpected errors and exit with error code
        logger.error(f"An error occurred: {str(e)}")
        sys.exit(1)

# Standard Python idiom to call main() if script is run directly
if __name__ == "__main__":
    main()