-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml
More file actions
159 lines (145 loc) · 3.22 KB
/
Copy pathconfig.yaml
File metadata and controls
159 lines (145 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# Main configuration file for the web scraper
# Scraper settings: run mode, request pacing, retries, and client identity.
scraper:
  mode: production  # Options: development, testing, sandbox, production
  rate_limit: 5  # Requests per second
  max_retries: 3
  chunk_size: 1000
  user_agent: "Business Address Scraper/1.0"
  respect_robots_txt: true
  # NOTE(review): unit is not stated here (presumably seconds). Combined with
  # concurrent_requests: 1, a 2-unit delay may keep the effective rate well
  # below rate_limit above - confirm which setting is meant to win.
  download_delay: 2
  concurrent_requests: 1
# Output settings: file format, target directory, naming, chunking, and
# compression.
output:
  format: csv  # Options: csv, json
  directory: data/output
  # NOTE(review): the pattern hard-codes a .csv extension although format may
  # also be json - confirm how the consumer derives the final file name.
  filename_pattern: "business_data_{timestamp}.csv"
  chunk_enabled: true
  chunk_size: 1000
  compress_output: true
# Logging settings: verbosity, log file location, line format, and rotation.
logging:
  level: INFO  # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL
  file: logs/scraper.log
  # Must stay quoted: a plain YAML scalar may not begin with '%'.
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  rotate: true
  max_size: 10485760  # 10 MiB (10 * 1024 * 1024)
  backup_count: 5
  compress: true
# Database settings: connection target, credentials, and pool sizing.
database:
  enabled: true
  type: postgresql
  host: db
  port: 5432  # Internal PostgreSQL port
  name: business_scraper
  user: postgres
  # NOTE(review): plaintext credential stored in the config file. Other
  # secrets in this file are supplied through ${VAR} placeholders (see
  # security.encryption_key); consider doing the same here once the loader's
  # substitution behaviour is confirmed.
  password: devpassword123
  pool_size: 5
  max_overflow: 10
  timeout: 30
# Cache settings: backend selection, storage location, and limits.
cache:
  enabled: true
  type: filesystem  # Options: filesystem, redis, memory
  directory: .cache
  expiration: 86400  # 24 hours (24 * 60 * 60)
  max_size: 1073741824  # 1 GiB (1024 ** 3)
# Proxy settings: rotation, health-check cadence, and failure handling.
proxy:
  enabled: true
  rotation: true
  check_interval: 300  # 5 minutes
  timeout: 10
  retry_count: 3
  blacklist_time: 3600  # 1 hour
# OCR settings: engine selection plus image preprocessing options.
ocr:
  enabled: true
  engine: tesseract
  language: eng
  config_path: config/tesseract
  timeout: 30
  # NOTE(review): nesting reconstructed - the four keys below are assumed to
  # belong to preprocessing; confirm against the consumer.
  preprocessing:
    resize: true
    max_size: 1920
    denoise: true
    threshold: true
# LLaMA settings: model location and generation parameters.
llama:
  # Placeholder expected to be filled from the environment. Left as a plain
  # (unquoted) scalar on purpose: quoting could stop a loader that matches
  # plain scalars from expanding it.
  model_path: ${LLAMA_MODEL_PATH}
  max_tokens: 100
  temperature: 0.7
  top_p: 0.9
  context_size: 2048
  batch_size: 8
# API settings: one sub-mapping per external search provider.
api:
  duckduckgo:
    enabled: true
    region: "us-en"
    safesearch: "moderate"
    timeout: 30
    max_results: 10
    backend: "api"
# Monitoring settings: metrics export and alerting thresholds.
monitoring:
  enabled: true
  # NOTE(review): nesting reconstructed - enabled, port and metrics_endpoint
  # are assumed to belong to prometheus, and the remaining keys to
  # monitoring itself; confirm against the consumer.
  prometheus:
    enabled: true
    port: 8000
    metrics_endpoint: ${METRICS_ENDPOINT}
  alert_email: ${ALERT_EMAIL}
  error_threshold: 100
  stats_interval: 60
# Security settings: TLS verification, encryption, and the domain allow-list.
security:
  ssl_verify: true
  use_encryption: true
  # Placeholder expected to be filled from the environment; keep the real key
  # out of this file.
  encryption_key: ${ENCRYPTION_KEY}
  allowed_domains:
    - chamberofcommerce.com
    - fda.gov
    - yelp.com
    - linkedin.com
    - ny.gov
    - nyc.gov
    - mapquest.com
    - yellowpages.com
    - bbb.org
    - bloomberg.com
    - manta.com
    - maps.google.com
    - google.com
# Performance settings: resource ceilings and housekeeping cadence.
performance:
  max_memory: 1073741824  # 1 GiB (1024 ** 3)
  max_cpu_percent: 80
  timeout: 300  # 5 minutes
  gc_interval: 3600  # 1 hour
# Error handling: retryable HTTP status codes, error notifications, and
# fallback behaviour.
error_handling:
  retry_codes:
    - 429  # Too Many Requests
    - 500  # Internal Server Error
    - 502  # Bad Gateway
    - 503  # Service Unavailable
    - 504  # Gateway Timeout
  # NOTE(review): nesting reconstructed - methods is assumed to belong to
  # notification, and fallback to error_handling; confirm against the
  # consumer.
  notification:
    enabled: true
    threshold: 100
    interval: 3600  # 1 hour
    methods:
      - email
      - log
  fallback:
    enabled: true
    max_attempts: 3
    delay: 60  # 1 minute
# Backup settings: schedule, retention window, and destination path.
backup:
  enabled: true
  interval: 86400  # 24 hours (24 * 60 * 60)
  retention: 7  # days
  compress: true
  path: /var/backup/scraper