Source code for omnigenbench.cli.commands.bench.bench_cli

# -*- coding: utf-8 -*-
# file: auto_bench_cli.py
# time: 21:06 31/01/2025
# author: YANG, HENG <hy345@exeter.ac.uk> (Yang Heng)
# Homepage: https://yangheng95.github.io
# github: https://github.com/yangheng95
# huggingface: https://huggingface.co/yangheng
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# Copyright (C) 2019-2025. All Rights Reserved.
import argparse
import os
import platform
import sys
import time
from pathlib import Path

from ....auto.auto_bench.auto_bench import AutoBench
from ....src.misc.utils import fprint
from ..base import BaseCommand



[docs]
class BenchCommand(BaseCommand):
    """
    Command-line front end for the ``autobench`` sub-command.

    This class provides a CLI interface for the AutoBench functionality, allowing users
    to easily run comprehensive evaluations of genomic models across multiple benchmarks.
    It supports various benchmarks, models, and training configurations.

    Supported option values (as registered by ``register_command``):
        --benchmark: one of RGB, PGB, GUE, GB, BEACON
        --trainer: one of native, accelerate, hf_trainer

    Example (shell commands, not Python statements):
        Run a basic benchmark::

            python -m omnigenbench.cli autobench --model "model_name" --benchmark "RGB"

        Run with custom settings::

            python -m omnigenbench.cli autobench --model "model_name" --benchmark "RGB" --trainer "accelerate" --bs_scale 2 --overwrite True
    """


[docs]
    @classmethod
    def register_command(cls, subparsers):
        """
        Register the ``autobench`` sub-command and all of its arguments.

        The created sub-parser gets the benchmark/tokenizer/model options, the
        optional tuning flags, whatever ``cls.add_common_arguments`` adds, and
        ``cls.execute`` as its ``func`` default so a dispatcher can call it.

        Args:
            subparsers: The subparsers object from argparse to add the command to

        Example:
            >>> parser = argparse.ArgumentParser()
            >>> subparsers = parser.add_subparsers()
            >>> BenchCommand.register_command(subparsers)
        """

        def _str_to_bool(value):
            """Convert a command-line token into a real boolean.

            ``type=bool`` must not be used with argparse: ``bool("False")`` is
            ``True`` because every non-empty string is truthy, so
            ``--overwrite False`` would silently enable overwriting.
            """
            if isinstance(value, bool):
                return value
            normalized = str(value).strip().lower()
            if normalized in {"true", "t", "yes", "y", "1"}:
                return True
            if normalized in {"false", "f", "no", "n", "0"}:
                return False
            raise argparse.ArgumentTypeError(
                f"Expected a boolean value (true/false), got {value!r}."
            )

        parser = subparsers.add_parser(
            "autobench",
            help="Run Auto-benchmarking for Genomic Foundation Models.",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        )
        # Benchmark selection (optional, defaults to RGB)
        parser.add_argument(
            "-b",
            "--benchmark",
            type=str,
            default="RGB",
            choices=["RGB", "PGB", "GUE", "GB", "BEACON"],
            help="Name of the benchmark suite to run.",
        )
        parser.add_argument(
            "-t",
            "--tokenizer",
            type=str,
            default=None,
            help="Path to the tokenizer to use (HF tokenizer ID or local path).",
        )

        # Required argument
        parser.add_argument(
            "-m",
            "--model",
            type=str,
            required=True,
            help="Path to the model to evaluate (HF model ID or local path).",
        )

        # Optional arguments
        parser.add_argument(
            "--overwrite",
            type=_str_to_bool,
            default=False,
            help="Overwrite existing bench results, otherwise resume from benchmark checkpoint.",
        )
        parser.add_argument(
            "--bs_scale",
            type=int,
            default=1,
            help="Batch size scale factor. To increase GPU memory utilization, set to 2 or 4, etc.",
        )
        parser.add_argument(
            "--trainer",
            type=str,
            default="accelerate",
            choices=["native", "accelerate", "hf_trainer"],
            help="Trainer to use for training. \n"
            "Use 'accelerate' for distributed training. "
            "You can use 'accelerate config' to customize behavior.\n"
            "Use 'hf_trainer' for Hugging Face Trainer. \n"
            "Set to 'native' to use native PyTorch training loop.\n",
        )

        cls.add_common_arguments(parser)
        # The CLI dispatcher is expected to call ``args.func(args)``.
        parser.set_defaults(func=cls.execute)



[docs]
    @staticmethod
    def execute(args: argparse.Namespace):
        """
        Execute the autobench command with the provided arguments.

        Steps, in order:
          1. Print usage hints.
          2. Resolve the model/tokenizer. Model identifiers containing
             ``"multimolecule"`` are loaded eagerly through the ``multimolecule``
             package; everything else is forwarded to ``AutoBench`` unchanged.
          3. Build an ``AutoBench`` instance and run it, forwarding every parsed
             argument as a keyword argument.
          4. Create ``<output_dir>/autobench_evaluations`` and compose a
             ``bench_internal`` command line whose output is piped into a
             timestamped log file in that directory.

        Args:
            args (argparse.Namespace): Parsed command-line arguments containing
                                      benchmark configuration and model settings.
                                      ``args.output_dir`` must be present
                                      (presumably added by ``add_common_arguments``
                                      -- confirm against the base class).

        Returns:
            On Windows, the composed command line as a string (it is NOT executed
            here). On every other platform the command is run through the shell
            and ``None`` is returned.

        Example:
            >>> args = parser.parse_args(['autobench', '--model', 'model_name'])
            >>> BenchCommand.execute(args)
        """
        # Function-scope import: only needed for quoting the POSIX shell command.
        import shlex

        fprint("Running benchmark, this may take a while, please be patient...")
        fprint("You can find the logs in the 'autobench_logs' directory.")
        fprint("You can find the metrics in the 'autobench_evaluations' directory.")
        fprint(
            "If you don't intend to use accelerate, please add '--trainer native' to the command."
        )
        fprint(
            "If you want to alter accelerate's behavior, please refer to 'accelerate config' command."
        )
        fprint(
            "If you encounter any issues, please report them on the GitHub repository."
        )
        # Special-case model handling: multimolecule checkpoints need their own
        # tokenizer class, and only the backbone (``base_model``) is benchmarked.
        if "multimolecule" in args.model:
            from multimolecule import RnaTokenizer, AutoModelForTokenPrediction

            tokenizer = RnaTokenizer.from_pretrained(args.model)
            model = AutoModelForTokenPrediction.from_pretrained(
                args.model, trust_remote_code=True
            ).base_model
        else:
            tokenizer = args.tokenizer
            model = args.model

        autobench = AutoBench(
            benchmark=args.benchmark,
            config_or_model=model,
            tokenizer=tokenizer,
            overwrite=args.overwrite,
            trainer=args.trainer,
        )
        autobench.run(**vars(args))
        log_dir = Path(args.output_dir) / "autobench_evaluations"
        log_dir.mkdir(parents=True, exist_ok=True)

        timestamp = time.strftime("%Y%m%d-%H%M%S")
        log_file = log_dir / f"bench_{args.benchmark}_{timestamp}.log"

        # Re-serialize the parsed arguments as command-line flags, leaving out
        # entries that are dispatcher/runtime details rather than bench options.
        # NOTE(review): a ``None`` value is emitted as a bare ``--flag``; confirm
        # the receiving parser accepts that form.
        forwarded_flags = [
            f"--{k}={v}" if v is not None else f"--{k}"
            for k, v in vars(args).items()
            if k not in {"func", "output_dir", "log_level"}
        ]

        if platform.system() == "Windows":
            # NOTE(review): this branch only returns the command string and never
            # runs it, unlike the POSIX branch below -- kept as-is because the
            # caller's use of the return value is not visible from here.
            cmd_base = (
                f"{sys.executable} -m omnigenbench_cli.bench_internal "
                + " ".join(forwarded_flags)
            )
            return f"{cmd_base} 2>&1 | powershell -Command \"tee-object -FilePath '{log_file}'\""

        # Quote every token so user-supplied values (model paths, output
        # directories) containing spaces or shell metacharacters cannot break or
        # inject into the command handed to the shell.
        cmd_base = " ".join(
            shlex.quote(token)
            for token in (
                sys.executable,
                "-m",
                "omnigenbench_cli.bench_internal",
                *forwarded_flags,
            )
        )
        os.system(f"{cmd_base} 2>&1 | tee {shlex.quote(str(log_file))}")





[docs]
def register_command(subparsers):
    """
    Module-level entry point that adds the ``autobench`` sub-command.

    Delegates directly to ``BenchCommand.register_command`` so callers can
    register the command without referencing the class themselves.

    Args:
        subparsers: The subparsers object from argparse to add the command to

    Example:
        >>> parser = argparse.ArgumentParser()
        >>> subparsers = parser.add_subparsers()
        >>> register_command(subparsers)
    """
    command_cls = BenchCommand
    command_cls.register_command(subparsers)