Source code for omnigenbench.auto.auto_bench.auto_bench_cli

# -*- coding: utf-8 -*-
# file: auto_bench_cli.py
# time: 19:18 05/02/2025
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
# Homepage: https://yangheng95.github.io
# github: https://github.com/yangheng95
# huggingface: https://huggingface.co/yangheng
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# Copyright (C) 2019-2025. All Rights Reserved.

import argparse
import os
import platform
import sys
import time

from typing import Optional

# Handle both relative and absolute imports
try:
    from ..auto_bench.auto_bench import AutoBench
    from ...src.misc.utils import fprint
except ImportError:
    # Fallback for direct execution
    import sys
    from pathlib import Path

    sys.path.insert(0, str(Path(__file__).parent.parent.parent))
    from omnigenbench.auto.auto_bench.auto_bench import AutoBench
    from omnigenbench.src.misc.utils import fprint



[docs]
def bench_command(args: Optional[list] = None):
    """
    Entry point of the benchmark CLI: parse options, build an AutoBench and run it.

    Args:
        args: Command-line tokens handed to ``ArgumentParser.parse_args``.
            When ``None``, argparse falls back to ``sys.argv[1:]``.
    """
    options = create_parser().parse_args(args)

    checkpoint = options.model
    fprint(f"\n>> Starting evaluation for model: {checkpoint}")

    # Default: hand the model identifier and the tokenizer option straight to
    # AutoBench.
    tokenizer, model = options.tokenizer, checkpoint

    # Checkpoints whose path mentions "multimolecule" are loaded eagerly with
    # the multimolecule classes; only the wrapped base model is benchmarked.
    if "multimolecule" in checkpoint:
        from multimolecule import AutoModelForTokenPrediction, RnaTokenizer

        tokenizer = RnaTokenizer.from_pretrained(checkpoint)
        token_model = AutoModelForTokenPrediction.from_pretrained(
            checkpoint, trust_remote_code=True
        )
        model = token_model.base_model

    bench_kwargs = {
        "benchmark": options.benchmark,
        "config_or_model": model,
        "tokenizer": tokenizer,
        "overwrite": options.overwrite,
        "trainer": options.trainer,
    }

    # Every parsed option is also forwarded to the run itself.
    AutoBench(**bench_kwargs).run(**vars(options))




[docs]
def create_parser() -> argparse.ArgumentParser:
    """
    Create the argument parser for the single-model benchmark CLI.

    Only ``--model`` is required. Every other option has a default, which
    ``ArgumentDefaultsHelpFormatter`` appends to the ``--help`` output.

    Returns:
        An `argparse.ArgumentParser` instance.
    """
    parser = argparse.ArgumentParser(
        description="Genomic Foundation Model Benchmark Suite (Single Model)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # Benchmark and tokenizer selection (both optional)
    parser.add_argument(
        "-b",
        "--benchmark",
        type=str,
        default="RGB",
        # choices=["RGB", "PGB", "GUE", "GB", "BEACON"],
        help="Benchmark to run: a benchmark name (e.g. RGB, PGB, GUE, GB, BEACON) "
        "or the path to a benchmark root directory.",
    )
    parser.add_argument(
        "-t",
        "--tokenizer",
        type=str,
        default=None,
        help="Path to the tokenizer to use (HF tokenizer ID or local path).",
    )

    # Required argument
    parser.add_argument(
        "-m",
        "--model",
        type=str,
        required=True,
        help="Path to the model to evaluate (HF model ID or local path).",
    )

    # Optional arguments
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing bench results, otherwise resume from benchmark checkpoint.",
    )
    parser.add_argument(
        "--bs_scale",
        type=int,
        default=1,
        help="Batch size scale factor. To increase GPU memory utilization, set to 2 or 4, etc.",
    )
    parser.add_argument(
        "--trainer",
        type=str,
        default="accelerate",
        choices=["native", "accelerate", "hf_trainer"],
        # The help formatter re-wraps this text, so embedded newlines would be
        # dropped anyway; keep it as one flowing sentence.
        help="Trainer to use for training. "
        "Use 'accelerate' for distributed training "
        "(run 'accelerate config' to customize its behavior), "
        "'hf_trainer' for the Hugging Face Trainer, "
        "or 'native' for the native PyTorch training loop.",
    )
    parser.add_argument(
        "--autocast",
        type=str,
        default="fp16",
        choices=["fp16", "fp32", "bf16", "fp8", "no"],
        help="Automatic mixed precision training mode.",
    )
    parser.add_argument(
        "--lora",
        action="store_true",
        help="Use LoRA fine-tuning if this flag is set.",
    )
    return parser




[docs]
def run_bench():
    """
    Launch the benchmark in a child process and mirror its output to a log file.

    The arguments this process received (``sys.argv[1:]``) are forwarded
    unchanged to this file executed as a script. With ``--trainer native`` the
    script is started with the current Python interpreter; otherwise it is
    started through ``accelerate launch`` with ``--mixed_precision`` taken from
    ``--autocast`` (default ``fp16``). The child's combined stdout/stderr is
    echoed to this process's stdout and written to
    ``autobench_logs/AutoBench-<timestamp>.log`` on every platform.

    This function does not return: it ends the process via ``sys.exit`` with
    the child's exit status, so a failed benchmark is reported as a failure.
    """
    import subprocess
    from pathlib import Path

    fprint("Running benchmark, this may take a while, please be patient...")
    fprint("You can find the logs in the 'autobench_logs' directory.")
    fprint("You can find the metrics in the 'autobench_evaluations' directory.")
    fprint(
        "If you don't intend to use accelerate, please add '--trainer native' to the command."
    )
    fprint(
        "If you want to alter accelerate's behavior, please refer to 'accelerate config' command."
    )
    fprint("If you encounter any issues, please report them on the GitHub repository.")
    os.makedirs("autobench_logs", exist_ok=True)
    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    log_file = f"autobench_logs/AutoBench-{time_str}.log"

    forwarded_args = sys.argv[1:]
    file_path = str(Path(__file__).resolve())

    # Build the command as an argument list (no shell), so values containing
    # spaces or shell metacharacters reach the child exactly as given.
    trainer = _cli_option_value(forwarded_args, "--trainer", "accelerate").lower()
    if trainer == "native":
        cmd = [sys.executable or "python", file_path, *forwarded_args]
    else:
        mixed_precision = _cli_option_value(
            forwarded_args, "--autocast", "fp16"
        ).lower()
        # `accelerate launch` has no "fp32" mode; full precision is spelled "no".
        if mixed_precision == "fp32":
            mixed_precision = "no"
        cmd = [
            "accelerate",
            "launch",
            "--mixed_precision",
            mixed_precision,
            file_path,
            *forwarded_args,
        ]

    # Make sure the messages above appear before any output of the child.
    sys.stdout.flush()
    try:
        process = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
    except OSError as err:
        fprint(f"Failed to start '{cmd[0]}': {err}")
        sys.exit(127)

    # Tee in Python instead of piping through the platform's `tee`: this works
    # the same on Windows and keeps the child's real exit status available.
    console = getattr(sys.stdout, "buffer", None)
    with process, open(log_file, "wb") as log_stream:
        # Forward raw chunks as they arrive (not whole lines) so that
        # carriage-return based progress output is passed through promptly.
        for chunk in iter(lambda: process.stdout.read1(8192), b""):
            log_stream.write(chunk)
            log_stream.flush()
            if console is not None:
                console.write(chunk)
                console.flush()
            else:
                # stdout was replaced by a text-only stream.
                sys.stdout.write(chunk.decode(errors="replace"))
                sys.stdout.flush()

    exit_code = process.returncode
    # A negative return code means "terminated by signal N"; report it using
    # the conventional 128 + N exit status.
    sys.exit(exit_code if exit_code >= 0 else 128 - exit_code)


def _cli_option_value(argv, flag, default):
    """Return the value given for ``flag`` in ``argv`` (``flag VALUE`` or ``flag=VALUE``), else ``default``.

    The last occurrence wins, matching how argparse resolves repeated options.
    A trailing ``flag`` with no value after it is ignored.
    """
    value = default
    prefix = flag + "="
    for position, token in enumerate(argv):
        if token == flag:
            if position + 1 < len(argv):
                value = argv[position + 1]
        elif token.startswith(prefix):
            value = token[len(prefix):]
    return value



# Script entry point: `run_bench` re-executes this file as a script (directly
# or through `accelerate launch`), which lands here and runs the benchmark.
if __name__ == "__main__":
    bench_command()