# Source: olive-recipes/Qwen-Qwen3.5-9B/builtin/optimize.py (microsoft/olive-recipes, main branch)
"""End-to-end optimization pipeline for Qwen3.5 ONNX models.

Exports three sub-models (vision encoder, text embedding, text decoder),
applies graph optimizations and INT4 quantization via Olive passes.

Usage:
    python optimize.py --config-dir cpu_and_mobile --device cpu
    python optimize.py --config-dir cpu_and_mobile --device cpu --skip-export
"""
import argparse
import json
import logging
from pathlib import Path

# Only let WARNING and above through from these two loggers.
for _noisy_logger in ("onnxscript", "onnx_ir"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
del _noisy_logger

# Directory name used as the default location of the generated model files.
MODELS_DIR = "models"


def export_models(config_dir: str):
    """Run the Olive pipeline for each sub-model config found in *config_dir*.

    The configs are executed in a fixed order: ``embedding.json``,
    ``text.json``, then ``vision.json``.

    Args:
        config_dir: Directory that contains the three Olive JSON configs.
    """
    # Imported lazily so the module can be loaded without Olive installed.
    from olive import run

    base_dir = Path(config_dir)
    pipeline_files = ("embedding.json", "text.json", "vision.json")

    print(f"=== Running Olive pipelines (configs from {base_dir}) ===")
    for file_name in pipeline_files:
        print(f"  Running {file_name}...")
        pipeline_path = base_dir / file_name
        run(str(pipeline_path))
    print()


def _provider_options(device: str):
    """Return ``(decoder_options, vision_options)`` provider lists for *device*.

    ``"gpu"`` maps to the CUDA provider, ``"webgpu"`` to the WebGPU provider,
    and any other value yields empty lists (no explicit provider options).
    """
    if device == "gpu":
        decoder_options = [
            {"cuda": {"enable_cuda_graph": "1", "enable_skip_layer_norm_strict_mode": "1"}}
        ]
        # Vision model has Loop nodes (one per ViT block) which are incompatible
        # with CUDA graph capture, so disable it for vision and embedding only.
        vision_options = [
            {"cuda": {"enable_cuda_graph": "0", "enable_skip_layer_norm_strict_mode": "1"}}
        ]
        return decoder_options, vision_options
    if device == "webgpu":
        return [{"webgpu": {}}], [{"webgpu": {}}]
    return [], []


def _build_processor_config() -> dict:
    """Return the image pre-processing pipeline written to processor_config.json."""
    return {
        "processor": {
            "name": "qwen2_5_image_processor",
            "transforms": [
                {"operation": {"name": "decode_image", "type": "DecodeImage", "attrs": {"color_space": "RGB"}}},
                {"operation": {"name": "convert_to_rgb", "type": "ConvertRGB"}},
                {"operation": {"name": "resize", "type": "Resize", "attrs": {
                    "width": 960, "height": 672, "smart_resize": 1,
                    "min_pixels": 65536, "max_pixels": 16777216, "patch_size": 16, "merge_size": 2,
                }}},
                {"operation": {"name": "rescale", "type": "Rescale", "attrs": {
                    "rescale_factor": 0.00392156862745098,
                }}},
                {"operation": {"name": "normalize", "type": "Normalize", "attrs": {
                    "mean": [0.5, 0.5, 0.5], "std": [0.5, 0.5, 0.5], "qwen2_5_vl": 1,
                }}},
                {"operation": {"name": "patch_image", "type": "PatchImage", "attrs": {
                    "patch_size": 16, "temporal_patch_size": 2, "merge_size": 2,
                }}},
            ],
        }
    }


def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu"):
    """Patch genai_config.json with embedding/vision sections and processor_config.

    Reads ``<output_dir>/genai_config.json``, sets the decoder session options,
    adds the ``embedding`` and ``vision`` model sections, overrides the
    special-token ids and search defaults, and writes the file back in place.
    Then creates ``<output_dir>/processor_config.json``.

    Args:
        output_dir: Directory holding ``genai_config.json``; the processor
            config is written next to it.
        device: ``"gpu"`` (CUDA provider options), ``"webgpu"``, or anything
            else for no explicit provider options.

    Raises:
        FileNotFoundError: If ``genai_config.json`` does not exist.
        KeyError: If the loaded config lacks the ``model``/``decoder`` or
            ``search`` sections.
    """
    out_dir = Path(output_dir)
    config_path = out_dir / "genai_config.json"

    # Explicit UTF-8 so the result does not depend on the platform's locale
    # encoding (JSON is UTF-8 by specification).
    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    provider_options, vision_provider_options = _provider_options(device)
    session_options = {"log_id": "onnxruntime-genai", "provider_options": provider_options}
    vision_session_options = {"log_id": "onnxruntime-genai", "provider_options": vision_provider_options}

    model = config["model"]
    model["decoder"]["session_options"] = session_options

    # Embedding and vision share the same session options.
    model["embedding"] = {
        "filename": "embedding.onnx",
        "inputs": {"input_ids": "input_ids", "image_features": "image_features"},
        "outputs": {"inputs_embeds": "inputs_embeds"},
        "session_options": vision_session_options,
    }

    model["vision"] = {
        "filename": "vision.onnx",
        "config_filename": "processor_config.json",
        "spatial_merge_size": 2,
        "tokens_per_second": 2.0,
        "patch_size": 16,
        "inputs": {"pixel_values": "pixel_values", "image_grid_thw": "image_grid_thw"},
        "outputs": {"image_features": "image_features"},
        "session_options": vision_session_options,
    }

    # Hard-coded special-token ids (presumably from the Qwen3.5 tokenizer;
    # re-verify them if the base model changes).
    model["bos_token_id"] = 248044
    model["eos_token_id"] = [248044]
    model["pad_token_id"] = 248044
    model["image_token_id"] = 248056
    model["video_token_id"] = 248057
    model["vision_start_token_id"] = 248053

    # top_k is always pinned to 1; top_p is only filled in when absent or null.
    search = config["search"]
    search["top_k"] = 1
    if search.get("top_p") is None:
        search["top_p"] = 1.0

    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=4)
    print(f"  Updated {config_path}")

    processor_path = out_dir / "processor_config.json"
    with open(processor_path, "w", encoding="utf-8") as f:
        json.dump(_build_processor_config(), f, indent=2)
    print(f"  Created {processor_path}")


def fix_tokenizer(output_dir: str = MODELS_DIR):
    """Fix tokenizer.json for C++ std::regex compatibility.

    Qwen3.5's tokenizer uses Unicode property escapes (\\p{L}, \\p{N}) in its
    Split pre-tokenizer, which aren't supported by std::regex in onnxruntime-genai.
    Remove the Split and keep only ByteLevel with use_regex=True.

    When ``tokenizer.json`` is missing the function returns without touching
    anything. Otherwise ``tokenizer.json`` is always rewritten, and
    ``tokenizer_config.json`` (if present) gets its ``tokenizer_class`` set to
    ``"Qwen2Tokenizer"``.

    Args:
        output_dir: Directory containing the tokenizer files.
    """
    out_dir = Path(output_dir)
    tk_path = out_dir / "tokenizer.json"
    if not tk_path.exists():
        return

    tk = json.loads(tk_path.read_text(encoding="utf-8"))
    # The key may be present with a JSON null value; `.get(key, {})` would then
    # return None and crash on the `.get("type")` below.
    pt = tk.get("pre_tokenizer") or {}
    if pt.get("type") == "Sequence":
        # Drop every non-ByteLevel step (e.g. the regex Split) from the sequence.
        byte_level_steps = [s for s in pt["pretokenizers"] if s.get("type") == "ByteLevel"]
        for step in byte_level_steps:
            step["use_regex"] = True
        pt["pretokenizers"] = byte_level_steps
    tk_path.write_text(json.dumps(tk, ensure_ascii=False), encoding="utf-8")

    tc_path = out_dir / "tokenizer_config.json"
    if tc_path.exists():
        tc = json.loads(tc_path.read_text(encoding="utf-8"))
        tc["tokenizer_class"] = "Qwen2Tokenizer"
        tc_path.write_text(json.dumps(tc, indent=2, ensure_ascii=False), encoding="utf-8")
    print("  Fixed tokenizer for C++ std::regex compatibility")


def main():
    """Parse the command line, optionally run the export, then write configs.

    Unless ``--models-dir`` is given, the models directory defaults to
    ``<config-dir>/models``. ``--skip-export`` bypasses the Olive runs and only
    regenerates the config and tokenizer files.
    """
    cli = argparse.ArgumentParser(description="Optimize Qwen3.5 ONNX models")
    argument_specs = (
        ("--device", {"choices": ["gpu", "cpu", "webgpu"], "default": "cpu"}),
        ("--config-dir", {"default": "cpu_and_mobile"}),
        ("--skip-export", {"action": "store_true"}),
        ("--models-dir", {"default": None}),
    )
    for flag, options in argument_specs:
        cli.add_argument(flag, **options)
    opts = cli.parse_args()

    # An explicit (non-empty) --models-dir wins over the derived default.
    if opts.models_dir:
        target_dir = opts.models_dir
    else:
        target_dir = str(Path(opts.config_dir) / MODELS_DIR)

    if not opts.skip_export:
        export_models(opts.config_dir)

    print("=== Generating configs ===")
    update_genai_config(output_dir=target_dir, device=opts.device)
    fix_tokenizer(output_dir=target_dir)
    print()
    print("Done.")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()