-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathMakefile
More file actions
90 lines (75 loc) · 3.4 KB
/
Copy pathMakefile
File metadata and controls
90 lines (75 loc) · 3.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Every target in this file is a command, not a file to build. `models` in
# particular shares its name with the models/ directory, so it must be phony
# or Make would consider it already up to date.
# (The stale `freeze` entry was dropped: no rule in this file defines it.)
.PHONY: install check-models models test demo \
        server-global server-earth server-fire \
        server-global-q3 server-global-iq2 \
        check-metal build-metal
# One-shot setup: create a Python 3.11 virtual environment with uv, sync the
# project dependencies into it, then run check-models via a sub-make.
install:
	uv venv --python 3.11
	uv sync
	@$(MAKE) check-models
	@printf '%s\n' "✅ Environment ready. Run 'source .venv/bin/activate' to activate."
# Verify that the three default Q4_K_M weight files exist under models/weights/.
# If any one of them is missing, run the download script; a failed download
# prints manual instructions and fails the target with exit status 1.
check-models:
	@echo "🔍 Checking for model files..."
	@missing=0; \
	for variant in global earth fire; do \
		[ -f "models/weights/tiny-aya-$${variant}-q4_k_m.gguf" ] || missing=1; \
	done; \
	if [ "$$missing" -eq 0 ]; then \
		echo "✅ Model files already present."; \
	else \
		echo "⚠️ Model files not found. Downloading (~4.2 GB, this may take 5-10 minutes)..."; \
		if uv run python models/pull_models.py; then \
			echo "✅ Models downloaded successfully."; \
		else \
			echo "❌ Model download failed. Please try manually:"; \
			echo " source .venv/bin/activate && python models/pull_models.py"; \
			exit 1; \
		fi; \
	fi
# Run the model download script directly, without the presence check that
# check-models performs first.
models:
	@printf '%s\n' \
		"📥 Downloading model files (~4.2 GB total)..." \
		"This may take 5-10 minutes depending on your internet speed."
	uv run python models/pull_models.py
# Run the pytest suite in quiet mode inside the uv-managed environment.
test:
	uv run pytest --quiet
# Locate the llama-server binary once, at parse time.
# A llama-server found on PATH wins; otherwise fall back to the locally
# compiled binary under models/llama.cpp/build/bin/.
llama_server_fallback := models/llama.cpp/build/bin/llama-server
LLAMA_SERVER := $(shell which llama-server 2>/dev/null || echo $(llama_server_fallback))
# Value passed to llama-server's -t flag by the GPU/Metal server targets.
# The GPU takes the heavy matrix operations, but CPU-side work (tokenization,
# sampling) still needs threads of its own.
# 4 threads is optimal for most MacBook Pro M-series chips.
CPU_THREADS := 4
# Serve the tiny-aya "global" Q4_K_M weights on port 8080 using the GPU/Metal
# configuration. All server-* targets bind the same port, so run one at a time.
server-global:
	$(LLAMA_SERVER) \
		-m models/weights/tiny-aya-global-q4_k_m.gguf \
		-ngl 99 \
		--flash-attn auto \
		--cache-prompt \
		-c 4096 \
		-t $(CPU_THREADS) \
		--port 8080
# Serve the tiny-aya "earth" Q4_K_M weights on port 8080 using the GPU/Metal
# configuration. All server-* targets bind the same port, so run one at a time.
server-earth:
	$(LLAMA_SERVER) \
		-m models/weights/tiny-aya-earth-q4_k_m.gguf \
		-ngl 99 \
		--flash-attn auto \
		--cache-prompt \
		-c 4096 \
		-t $(CPU_THREADS) \
		--port 8080
# Serve the tiny-aya "fire" Q4_K_M weights on port 8080 using the GPU/Metal
# configuration. All server-* targets bind the same port, so run one at a time.
server-fire:
	$(LLAMA_SERVER) \
		-m models/weights/tiny-aya-fire-q4_k_m.gguf \
		-ngl 99 \
		--flash-attn auto \
		--cache-prompt \
		-c 4096 \
		-t $(CPU_THREADS) \
		--port 8080
# CPU users — lower quantization for survivable latency
# NOTE: Q3 and IQ2 model files are not downloaded by default.
# Download manually from HuggingFace before using these targets.
# Everyone on GPU/Metal should use server-global above.
#
# Serve the Q3_K_M "global" weights with no GPU offload (-ngl 0) on port 8080.
# The thread count is computed by the shell when the recipe runs: `nproc`
# where available, otherwise macOS `sysctl -n hw.physicalcpu`. The `$$` is
# required so Make passes a literal `$(...)` to the shell; a single `$` would
# be expanded by Make as an undefined variable, leaving -t without a value.
# Uses $(LLAMA_SERVER) so a locally compiled binary is found, matching the
# other server targets.
server-global-q3:
	$(LLAMA_SERVER) -m models/weights/tiny-aya-global-q3_k_m.gguf \
		-ngl 0 --cache-prompt -c 2048 \
		-t "$$(nproc 2>/dev/null || sysctl -n hw.physicalcpu)" \
		--port 8080
# Serve the IQ2_XXS "global" weights with no GPU offload (-ngl 0) on port 8080.
# The thread count is computed by the shell when the recipe runs: `nproc`
# where available, otherwise macOS `sysctl -n hw.physicalcpu`. The `$$` is
# required so Make passes a literal `$(...)` to the shell; a single `$` would
# be expanded by Make as an undefined variable, leaving -t without a value.
# Uses $(LLAMA_SERVER) so a locally compiled binary is found, matching the
# other server targets.
# NOTE(review): unlike server-global-q3 this does not pass --cache-prompt —
# confirm that is intentional.
server-global-iq2:
	$(LLAMA_SERVER) -m models/weights/tiny-aya-global-iq2_xxs.gguf \
		-ngl 0 -c 2048 \
		-t "$$(nproc 2>/dev/null || sysctl -n hw.physicalcpu)" \
		--port 8080
# Metal verification — run this if generation feels slow on Mac
# Greps the `--version` output of the same binary the server targets launch
# ($(LLAMA_SERVER)) for metal/mps/gpu, case-insensitively, and warns when
# nothing matches. `grep -E` is used because `\|` alternation in basic regular
# expressions is not portable across grep implementations.
# The recompile hint is single-quoted with `$$` so the command is printed
# literally, ready to copy, instead of being expanded to nothing by Make.
check-metal:
	@echo "Checking Metal support in llama-server..."
	@$(LLAMA_SERVER) --version 2>&1 | grep -iE 'metal|mps|gpu' \
		|| echo "WARNING: Metal not detected — llama.cpp may not be compiled with Metal support."
	@echo 'If Metal is missing, recompile: cmake .. -DLLAMA_METAL=ON && make -j$$(sysctl -n hw.physicalcpu)'
# Configure and build llama.cpp with Metal enabled.
# Recipes run from this Makefile's directory, so the source and build trees
# are named explicitly; the build lands in models/llama.cpp/build, which is
# where the LLAMA_SERVER fallback path expects bin/llama-server.
# The job count comes from macOS `sysctl -n hw.physicalcpu`, evaluated by the
# shell (`$$`), so the build is bounded rather than an unlimited `-j`.
# NOTE(review): newer llama.cpp releases may name this option GGML_METAL —
# confirm against the checked-out llama.cpp version.
build-metal:
	@echo "Recompiling llama.cpp with Metal support..."
	cmake -S models/llama.cpp -B models/llama.cpp/build -DLLAMA_METAL=ON -DCMAKE_BUILD_TYPE=Release
	cmake --build models/llama.cpp/build --parallel "$$(sysctl -n hw.physicalcpu)"
# Launch the demo by running the ui.app module inside the uv-managed environment.
demo:
	uv run python -m ui.app
# requirements.txt removed — uv.lock + pyproject.toml are the source of truth
# Use 'uv sync' to install dependencies
# Use 'uv add <package>' to add new packages (updates pyproject.toml + uv.lock)