docunative/Makefile at main · docunative-AI/docunative · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Every target in this file is a command, not a file, so all of them are phony.
# The stale `freeze` entry was dropped: no rule with that name exists any more.
.PHONY: install test demo \
	check-models models \
	server-global server-earth server-fire \
	server-global-q3 server-global-iq2 \
	check-metal build-metal

# Set up the project environment: create a Python 3.11 virtualenv with uv,
# sync the dependencies, then run the check-models target (which downloads the
# model weights when they are missing) before printing the activation hint.
install:
	uv venv --python 3.11
	uv sync
	@$(MAKE) check-models
	@echo "✅ Environment ready. Run 'source .venv/bin/activate' to activate."

# Verify that the three Q4_K_M weight files exist under models/weights/.
# When all are present, report that and stop. When any is missing, run
# models/pull_models.py through uv and fail the target (exit 1) if the
# download script exits non-zero.
check-models:
	@echo "🔍 Checking for model files..."
	@missing=0; \
	for weights in \
		models/weights/tiny-aya-global-q4_k_m.gguf \
		models/weights/tiny-aya-earth-q4_k_m.gguf \
		models/weights/tiny-aya-fire-q4_k_m.gguf; do \
		[ -f "$$weights" ] || missing=1; \
	done; \
	if [ "$$missing" -eq 0 ]; then \
		echo "✅ Model files already present."; \
		exit 0; \
	fi; \
	echo "⚠️  Model files not found. Downloading (~4.2 GB, this may take 5-10 minutes)..."; \
	if uv run python models/pull_models.py; then \
		echo "✅ Models downloaded successfully."; \
	else \
		echo "❌ Model download failed. Please try manually:"; \
		echo "   source .venv/bin/activate && python models/pull_models.py"; \
		exit 1; \
	fi

# Unconditionally run the model download script (no presence check, unlike
# check-models). Prints a two-line notice first.
models:
	@printf '%s\n' \
		"📥 Downloading model files (~4.2 GB total)..." \
		"This may take 5-10 minutes depending on your internet speed."
	uv run python models/pull_models.py

# Run the test suite with pytest in quiet mode inside the uv-managed environment.
test:
	uv run pytest -q

# Detect llama-server binary location (resolved once, when the Makefile is parsed).
# Prefers the binary found on PATH; otherwise falls back to the compiled binary
# in models/llama.cpp/build/bin/. The fallback path is not checked for existence.
# `command -v` is the POSIX way to look a command up; `which` is not guaranteed
# to be installed on every system.
LLAMA_SERVER := $(shell command -v llama-server 2>/dev/null || echo models/llama.cpp/build/bin/llama-server)

# Thread count passed as `-t` to the GPU/Metal server targets (server-global,
# server-earth, server-fire). It covers the CPU-side work (tokenization,
# sampling); the GPU handles the heavy matrix ops.
# 4 is the project's chosen default for MacBook Pro M-series chips
# (NOTE(review): "optimal" is the original author's claim, not measured here).
# Override per invocation with e.g. `make server-global CPU_THREADS=8`.
CPU_THREADS := 4

# Serve the "global" Q4_K_M weights with llama-server on port 8080.
# -ngl 99 presumably offloads every layer to the GPU (confirm against the
# llama-server docs); -c sets a 4096-token context; -t uses $(CPU_THREADS).
server-global:
	$(LLAMA_SERVER) \
		-m models/weights/tiny-aya-global-q4_k_m.gguf \
		-ngl 99 \
		--flash-attn auto \
		--cache-prompt \
		-c 4096 \
		-t $(CPU_THREADS) \
		--port 8080

# Serve the "earth" Q4_K_M weights with llama-server on port 8080.
# Same options as the other GPU/Metal server targets; only the model file differs.
server-earth:
	$(LLAMA_SERVER) \
		-m models/weights/tiny-aya-earth-q4_k_m.gguf \
		-ngl 99 \
		--flash-attn auto \
		--cache-prompt \
		-c 4096 \
		-t $(CPU_THREADS) \
		--port 8080

# Serve the "fire" Q4_K_M weights with llama-server on port 8080.
# Same options as the other GPU/Metal server targets; only the model file differs.
server-fire:
	$(LLAMA_SERVER) \
		-m models/weights/tiny-aya-fire-q4_k_m.gguf \
		-ngl 99 \
		--flash-attn auto \
		--cache-prompt \
		-c 4096 \
		-t $(CPU_THREADS) \
		--port 8080

# CPU users — lower quantization for survivable latency
# NOTE: Q3 and IQ2 model files are not downloaded by default.
# Download manually from HuggingFace before using these targets.
# Everyone on GPU/Metal should use server-global above.
#
# Serves the Q3_K_M "global" weights on port 8080 with no GPU offload (-ngl 0)
# and a 2048-token context. The thread count is detected by the shell when the
# recipe runs: `nproc` where available, otherwise `sysctl -n hw.physicalcpu`.
# The `$$(` … `)` spelling is required: a single `$(` … `)` is expanded by Make
# as an (undefined) variable, which left `-t` without a value.
server-global-q3:
	$(LLAMA_SERVER) -m models/weights/tiny-aya-global-q3_k_m.gguf \
		-ngl 0 --cache-prompt -c 2048 \
		-t $$(nproc 2>/dev/null || sysctl -n hw.physicalcpu) \
		--port 8080

# CPU-only server for the IQ2_XXS "global" weights on port 8080 (-ngl 0,
# 2048-token context). The weight file must already exist under models/weights/.
# The thread count is detected by the shell when the recipe runs: `nproc` where
# available, otherwise `sysctl -n hw.physicalcpu`. `$$(` … `)` is required so
# that Make passes the command substitution through to the shell instead of
# expanding it to an empty string.
server-global-iq2:
	$(LLAMA_SERVER) -m models/weights/tiny-aya-global-iq2_xxs.gguf \
		-ngl 0 -c 2048 \
		-t $$(nproc 2>/dev/null || sysctl -n hw.physicalcpu) \
		--port 8080

# Metal verification — run this if generation feels slow on Mac
# Greps the `--version` output of the detected llama-server binary for
# "metal", "mps" or "gpu" (case-insensitive) and warns when nothing matches.
# The target always succeeds; it only prints diagnostics.
# grep -E is used because `\|` alternation in a basic regex is not POSIX.
# The recompile hint is single-quoted and uses `$$` so the shell prints the
# `$(sysctl ...)` substitution literally, ready to copy and paste.
check-metal:
	@echo "Checking Metal support in llama-server..."
	@$(LLAMA_SERVER) --version 2>&1 | grep -iE "metal|mps|gpu" \
		|| echo "WARNING: Metal not detected — llama.cpp may not be compiled with Metal support."
	@echo 'If Metal is missing, recompile: cmake .. -DLLAMA_METAL=ON && make -j$$(sysctl -n hw.physicalcpu)'

# Location of the llama.cpp checkout. The default matches the fallback binary
# path models/llama.cpp/build/bin/llama-server used for llama-server detection.
LLAMA_CPP_DIR ?= models/llama.cpp

# Configure and build llama.cpp with Metal enabled (macOS: the job count comes
# from `sysctl -n hw.physicalcpu`).
# Recipes run from the Makefile's directory, each line in its own shell, so the
# source and build directories are passed explicitly rather than via `cmake ..`.
# `$$(` … `)` hands the command substitution to the shell; a single `$(` … `)`
# is expanded by Make to an empty string.
# NOTE(review): recent llama.cpp releases spell this option GGML_METAL and
# treat LLAMA_METAL as deprecated — confirm against the checked-out version.
build-metal:
	@echo "Recompiling llama.cpp with Metal support..."
	cmake -S $(LLAMA_CPP_DIR) -B $(LLAMA_CPP_DIR)/build -DLLAMA_METAL=ON -DCMAKE_BUILD_TYPE=Release
	cmake --build $(LLAMA_CPP_DIR)/build --config Release -j $$(sysctl -n hw.physicalcpu)

# Launch the UI by running the ui.app module inside the uv-managed environment.
demo:
	uv run python -m ui.app

# requirements.txt removed — uv.lock + pyproject.toml are the source of truth
# Use 'uv sync' to install dependencies
# Use 'uv add <package>' to add new packages (updates pyproject.toml + uv.lock)