Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions .idea/modelinfo-cli.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ Qwen2.5-0.5B 494.0M BF16 8K 1.6 GB ✓
| `--topology` | `--topology nvlink` | Set interconnect topology to calculate exact communication overhead penalties (`nvlink`, `pcie4`, `pcie3`). Defaults to `pcie4`. |
| `--strategy` | `--strategy tp` | Selects the parallelization strategy for multi-GPU setups (`tp` for Tensor Parallelism, `pp` for Pipeline Parallelism). Defaults to `tp`. |
| `--tensors` | `--tensors` | Bypasses the algorithmic speed estimation and forces the tool to fetch all remote shards, displaying an exact size breakdown of every tensor. |
| `--timeout` | `--timeout 30` | Timeout in seconds for network requests to the Hugging Face Hub. Must be a finite number greater than 0. Defaults to `10.0`. |
| `-v, --version` | `modelinfo -v` | Show program's version number and exit. |

## Architecture
Expand Down
121 changes: 86 additions & 35 deletions src/modelinfo/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import os
import sys
import math
from typing import Sequence
from modelinfo.architecture import identify_architecture_name
from modelinfo.calculator import calculate_footprint
Expand All @@ -12,7 +13,13 @@


class VersionAction(argparse.Action):
def __init__(self, option_strings, dest=argparse.SUPPRESS, default=argparse.SUPPRESS, help="show program's version number and exit"):
def __init__(
self,
option_strings,
dest=argparse.SUPPRESS,
default=argparse.SUPPRESS,
help="show program's version number and exit",
):
super().__init__(
option_strings=option_strings,
dest=dest,
Expand Down Expand Up @@ -41,12 +48,25 @@ def _positive_int(value: str) -> int:
return ivalue


def _positive_float(value: str) -> float:
    """Parse *value* as a finite float strictly greater than zero.

    Intended for use as an argparse ``type=`` callable, so every failure is
    reported as ``argparse.ArgumentTypeError``.

    Args:
        value: Raw command-line token to convert.

    Returns:
        The parsed float.

    Raises:
        argparse.ArgumentTypeError: If *value* cannot be parsed as a float,
            is NaN or infinite, or is not greater than 0.
    """
    try:
        fvalue = float(value)
    except ValueError:
        # The message already carries the offending token; suppress the
        # implicit ValueError context so it does not clutter tracebacks.
        raise argparse.ArgumentTypeError(f"Invalid float value: {value}") from None

    # NaN compares False against everything, so the `<= 0` check below would
    # let it through; reject NaN and +/-inf explicitly first.
    if not math.isfinite(fvalue):
        raise argparse.ArgumentTypeError(f"Timeout must be a finite number: {value}")
    if fvalue <= 0:
        raise argparse.ArgumentTypeError(f"Timeout must be greater than 0: {value}")
    return fvalue


def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
parser = argparse.ArgumentParser(
prog="modelinfo",
description="High-performance CLI utility to inspect ML model checkpoints and calculate VRAM requirements.",
)

parser.add_argument(
"file",
type=str,
Expand Down Expand Up @@ -107,6 +127,12 @@ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
default=0.9,
help="vLLM gpu_memory_utilization ratio (default 0.9). Reserves 10 percent for PyTorch context.",
)
parser.add_argument(
"--timeout",
type=_positive_float,
default=10.0,
help="Network request timeout in seconds for Hugging Face Hub (default 10.0).",
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
parser.add_argument(
"-v",
"--version",
Expand All @@ -117,49 +143,61 @@ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:


def analyze_model(
file_path: str,
context_override: int | None,
file_path: str,
context_override: int | None,
gpu_count: int = 1,
batch_size: int = 1,
fetch_tensors: bool = False,
topology: str = "pcie4",
strategy: str = "tp",
is_vllm: bool = False,
gpu_vram_gb: float = 0.0,
gpu_util: float = 0.9
gpu_util: float = 0.9,
timeout: float = 10.0,
) -> dict:
tensors = {}
config = None
disk_size = 0.0

file_path_lower = file_path.lower()

if not os.path.exists(file_path) and not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")):

if not os.path.exists(file_path) and not file_path_lower.endswith(
(".safetensors", ".gguf", ".pt", ".bin", ".index.json")
):
from modelinfo.parsers.huggingface import fetch_huggingface_repo
tensors, config, format_name, disk_size = fetch_huggingface_repo(file_path, fetch_tensors=fetch_tensors)
elif file_path_lower.endswith(".safetensors") or file_path_lower.endswith(".index.json"):

tensors, config, format_name, disk_size = fetch_huggingface_repo(
file_path, fetch_tensors=fetch_tensors, timeout=timeout
)
elif file_path_lower.endswith(".safetensors") or file_path_lower.endswith(
".index.json"
):
tensors = parse_safetensors_header(file_path)
format_name = "SafeTensors"

config_path = os.path.join(os.path.dirname(file_path), "config.json")
if os.path.exists(config_path):
try:
with open(config_path, "r", encoding="utf-8") as f:
config = json.load(f)
except (json.JSONDecodeError, OSError):
pass

elif file_path_lower.endswith(".gguf"):
tensors = parse_gguf_header(file_path)
format_name = "GGUF"
elif file_path_lower.endswith(".pt") or file_path_lower.endswith(".bin"):
tensors = parse_pytorch_header(file_path)
format_name = "PyTorch"
elif os.path.isdir(file_path):
raise IsADirectoryError(f"'{file_path}' is a directory. Please provide the path to a specific weights file (e.g. .safetensors, .gguf, .pt) inside the directory.")
raise IsADirectoryError(
f"'{file_path}' is a directory. Please provide the path to a specific weights file (e.g. .safetensors, .gguf, .pt) inside the directory."
)
else:
raise ValueError(f"File '{file_path}' not found locally and does not appear to be a Hugging Face repository ID.")

raise ValueError(
f"File '{file_path}' not found locally and does not appear to be a Hugging Face repository ID."
)

max_context = None
if config:
max_context = config.get("max_position_embeddings")
Expand All @@ -168,15 +206,15 @@ def analyze_model(
gen_arch = metadata.get("general.architecture")
if gen_arch:
max_context = metadata.get(f"{gen_arch}.context_length")

is_default_context = False
context_length = context_override
if context_length is None:
context_length = min(8192, max_context) if max_context else 8192
is_default_context = True

footprint = calculate_footprint(
tensors,
tensors,
context_length=context_length,
batch_size=batch_size,
config=config,
Expand All @@ -185,16 +223,16 @@ def analyze_model(
strategy=strategy,
is_vllm=is_vllm,
gpu_vram_bytes=gpu_vram_gb * 1024**3 if gpu_vram_gb else 0.0,
gpu_util=gpu_util
gpu_util=gpu_util,
)
num_layers = footprint["num_layers"]
arch_name = identify_architecture_name(tensors, num_layers, config)

if format_name != "SafeTensors" or os.path.exists(file_path):
disk_size = os.path.getsize(file_path) if os.path.exists(file_path) else 0.0

tensor_count = len([k for k in tensors.keys() if k != "__metadata__"])

return {
"format_name": format_name,
"arch_name": arch_name,
Expand All @@ -211,7 +249,7 @@ def analyze_model(
"strategy": strategy,
"is_vllm": is_vllm,
"gpu_vram_gb": gpu_vram_gb,
"gpu_util": gpu_util
"gpu_util": gpu_util,
}


Expand All @@ -221,52 +259,65 @@ def main(argv: Sequence[str] | None = None) -> int:
gpu_name_display = None
gpu_vram_gb = None
gpu_count = 1

if args.gpu or args.vllm:
target = args.gpu if args.gpu else "auto"
from modelinfo.hardware import resolve_gpu

gpu_name_display, gpu_vram_gb, gpu_count = resolve_gpu(target)

if len(args.file) > 1:
if args.vllm:
console.print("[red]Error: Side-by-side comparison does not currently support the --vllm capacity simulation. Compare models sequentially or remove --vllm.[/red]")
console.print(
"[red]Error: Side-by-side comparison does not currently support the --vllm capacity simulation. Compare models sequentially or remove --vllm.[/red]"
)
return 1

models = []
for model_path in args.file:
info = analyze_model(
model_path,
args.context,
model_path,
args.context,
gpu_count=gpu_count,
batch_size=args.batch_size,
fetch_tensors=args.tensors,
topology=args.topology,
strategy=args.strategy,
is_vllm=args.vllm,
gpu_vram_gb=gpu_vram_gb if gpu_vram_gb else 0.0,
gpu_util=args.gpu_util
gpu_util=args.gpu_util,
timeout=args.timeout,
)
models.append((model_path.split("/")[-1], info))

print_compare_info(models, gpu_vram_gb if gpu_vram_gb else args.max_vram, gpu_name=gpu_name_display)

print_compare_info(
models,
gpu_vram_gb if gpu_vram_gb else args.max_vram,
gpu_name=gpu_name_display,
)
return 0

file_path = args.file[0]

info = analyze_model(
file_path,
args.context,
file_path,
args.context,
gpu_count=gpu_count,
batch_size=args.batch_size,
fetch_tensors=args.tensors,
topology=args.topology,
strategy=args.strategy,
is_vllm=args.vllm,
gpu_vram_gb=gpu_vram_gb if gpu_vram_gb else 0.0,
gpu_util=args.gpu_util
gpu_util=args.gpu_util,
timeout=args.timeout,
)

print_model_info(**info, max_vram_gb=gpu_vram_gb if gpu_vram_gb else args.max_vram, gpu_name=gpu_name_display)
print_model_info(
**info,
max_vram_gb=gpu_vram_gb if gpu_vram_gb else args.max_vram,
gpu_name=gpu_name_display,
)
return 0


Expand Down
Loading