Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,22 @@
Changes
=======

Unreleased

- feat: the event preprocessors go through a single counter-robots classifier,
exposed as the cached ``current_stats.visitor_classifier`` property and built from
``STATS_VISITOR_CLASSIFIER`` (an import path or ``app -> Classifier``; the default
is the COUNTER baseline plus the extended preset). ``flag_robots`` /
``flag_machines`` keep setting ``is_robot`` / ``is_machine`` as before.
- feat: ``exclude_datacenter_browser`` preprocessor drops events whose user agent
looks like a browser but whose IP resolves to a datacenter/hosting ASN, catching
automation faking a browser from cloud infrastructure. It writes nothing to the
event (no ``is_datacenter`` field) and only excludes, returning ``None`` or the
document unchanged. It must run before ``anonymize_user``, which removes
``ip_address`` from the event. Datacenter resolution
is enabled by pointing ``STATS_VISITOR_ASN_DB`` at a GeoLite2-ASN mmdb (the
``counter-robots[asn]`` extra). invenio-stats holds no lists. Requires
counter-robots>=2026.6.

Version v6.1.3 (released 2026-04-30)

- fix(stats): warm event cache on finalization
Expand Down
3 changes: 2 additions & 1 deletion invenio_stats/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,8 @@ def agg_iter(self, dt, previous_bookmark):
"""Aggregate and return dictionary to be indexed in the search engine."""
rounded_dt = format_range_dt(dt, self.interval)
agg_query = (
dsl.Search(using=self.client, index=self.event_index).filter(
dsl.Search(using=self.client, index=self.event_index)
.filter(
# Filter for the specific interval (hour, day, month)
"term",
timestamp=rounded_dt,
Expand Down
17 changes: 17 additions & 0 deletions invenio_stats/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,20 @@
stripped via ``datetime.replace(tzinfo=None)``). Set to ``True`` to use
timezone-aware UTC datetimes with explicit UTC timezone information.
"""

# A falsy value (the default ``None``) makes the extension state fall back to
# ``invenio_stats.ext.default_visitor_classifier``.
STATS_VISITOR_CLASSIFIER = None
"""Factory for the visitor classifier used by the event preprocessors.

An import path or callable ``app -> counter_robots.Classifier``. When unset, the
default factory builds the COUNTER baseline plus the extended preset (and a
datacenter resolver when ``STATS_VISITOR_ASN_DB`` is set). Point this at your own
factory to add instance-specific lists. The classifier is built once per
application, cached on the extension state.
"""

# Read by the default factory via ``app.config.get("STATS_VISITOR_ASN_DB")``; a
# falsy value leaves the default classifier without an ASN resolver.
STATS_VISITOR_ASN_DB = None
"""Path to a GeoLite2-ASN mmdb, used by the default visitor-classifier factory.

Enables datacenter detection (needs the ``counter-robots[asn]`` extra). The mmdb is
provided and refreshed by the deployment; it is not shipped.
"""
34 changes: 34 additions & 0 deletions invenio_stats/ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,27 @@
_Query = namedtuple("Query", ["name", "cls", "permission_factory", "params"])


def default_visitor_classifier(app):
    """Create the classifier used when ``STATS_VISITOR_CLASSIFIER`` is unset.

    The COUNTER preset is applied first, then the extended preset. When the
    application config holds a truthy ``STATS_VISITOR_ASN_DB``, that value is
    passed to ``maxminddb_resolver`` and the result is registered as the
    builder's ASN resolver (requires the ``counter-robots[asn]`` extra).

    :param app: Application whose ``config`` is consulted.
    :returns: The object produced by ``ClassifierBuilder.build()``.
    """
    from counter_robots import (
        ClassifierBuilder,
        counter_preset,
        extended_preset,
        maxminddb_resolver,
    )

    classifier_builder = ClassifierBuilder()
    classifier_builder = classifier_builder.use(counter_preset)
    classifier_builder = classifier_builder.use(extended_preset)

    asn_db_path = app.config.get("STATS_VISITOR_ASN_DB")
    if asn_db_path:
        # Datacenter detection is opt-in: only wired up when a database is set.
        resolver = maxminddb_resolver(asn_db_path)
        classifier_builder.asn_resolver(resolver)
    return classifier_builder.build()


class _InvenioStatsState(object):
"""State object for Invenio stats."""

Expand Down Expand Up @@ -129,6 +150,19 @@ def permission_factory(self):
"""Load default permission factory for Buckets collections."""
return load_or_import_from_config("STATS_PERMISSION_FACTORY", app=self.app)

@cached_property
def visitor_classifier(self):
    """Return the counter-robots classifier shared by the preprocessors.

    A truthy ``STATS_VISITOR_CLASSIFIER`` (an import path or callable
    ``app -> counter_robots.Classifier``) is resolved with
    ``obj_or_import_string`` and called with the application; otherwise
    :func:`default_visitor_classifier` builds the classifier. As a
    ``cached_property`` the value is computed once per state object.
    """
    configured_factory = self.app.config.get("STATS_VISITOR_CLASSIFIER")
    if not configured_factory:
        return default_visitor_classifier(self.app)
    build_classifier = obj_or_import_string(configured_factory)
    return build_classifier(self.app)

def publish(self, event_type, events):
"""Publish events."""
assert event_type in self.events
Expand Down
37 changes: 34 additions & 3 deletions invenio_stats/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@
from functools import partial
from time import mktime

from counter_robots import is_machine, is_robot
from dateutil import parser
from flask import current_app
from invenio_base.utils import obj_or_import_string
from invenio_search import current_search_client
from invenio_search.engine import search
from invenio_search.utils import prefix_index

from .proxies import current_stats
from .utils import get_anonymization_salt, get_geoip


Expand Down Expand Up @@ -99,7 +99,9 @@ def flag_robots(doc, exclude=False):
into robots and machines by `the Make Data Count project
<https://github.com/CDLUC3/Make-Data-Count/tree/master/user-agents>`_.
"""
doc["is_robot"] = "user_agent" in doc and is_robot(doc["user_agent"])
doc["is_robot"] = "user_agent" in doc and current_stats.visitor_classifier.is_robot(
doc["user_agent"]
)
if exclude and doc["is_robot"]:
return None
return doc
Expand All @@ -120,7 +122,10 @@ def flag_machines(doc, exclude=False):
<https://github.com/CDLUC3/Make-Data-Count/tree/master/user-agents>`_.

"""
doc["is_machine"] = "user_agent" in doc and is_machine(doc["user_agent"])
doc["is_machine"] = (
"user_agent" in doc
and current_stats.visitor_classifier.is_machine(doc["user_agent"])
)
if exclude and doc["is_machine"]:
return None
return doc
Expand All @@ -130,6 +135,32 @@ def flag_machines(doc, exclude=False):
"""Filter out machine events."""


def exclude_datacenter_browser(doc):
    """Drop browser events that originate from datacenter/hosting networks.

    Complements :func:`flag_robots`: counter-robots catches self-identifying bots
    by their user agent, while this catches automation that fakes a real browser
    user agent but runs from cloud or hosting infrastructure. The event is dropped
    (``None`` is returned) when its user agent looks like a browser and its IP
    resolves to a datacenter ASN; otherwise the document is returned unchanged.
    Nothing is written to the event.

    Datacenter resolution is only active when the visitor classifier has an ASN
    resolver configured (``STATS_VISITOR_ASN_DB``); otherwise this is a no-op. It
    must run before :func:`anonymize_user`, which removes ``ip_address`` from the
    event.

    :param doc: Event document; ``ip_address`` and ``user_agent`` are read.
    :returns: ``None`` when the event is excluded, else the same ``doc`` object.
    """
    ip = doc.get("ip_address")
    classifier = current_stats.visitor_classifier
    if not ip:
        # Nothing to resolve (missing, ``None`` or empty IP): keep the event.
        return doc

    # ``doc.get("user_agent", "")`` only covers a missing key; a key that is
    # present but ``None`` must be normalised as well so the classifier always
    # receives a string.
    user_agent = doc.get("user_agent") or ""
    if classifier.is_browser(user_agent) and classifier.is_datacenter_ip(ip):
        return None
    return doc


def hash_id(iso_timestamp, msg):
"""Generate event id, optimized for the search engine."""
return "{0}-{1}".format(
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ packages = find:
python_requires = >=3.7
zip_safe = False
install_requires =
counter-robots>=2018.6
counter-robots>=2026.6
invenio-base>=2.0.0,<3.0.0
invenio-cache>=3.0.0,<4.0.0
invenio-celery>=2.0.0,<3.0.0
Expand Down
79 changes: 79 additions & 0 deletions tests/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,3 +426,82 @@ def _raises_on_second_call(doc):
assert get_queue_size("stats-file-download") == 0
assert search_obj.index("events-stats-file-download").count() == 3
assert search_obj.index("events-stats-file-download-2018-01").count() == 3


# Desktop Chrome-on-Windows user agent string; the tests below use it as the
# "ordinary browser" input.
BROWSER_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36"
)


def _classifier(asn_by_ip=None):
    """Return a real counter-robots classifier (COUNTER baseline + extended preset).

    When ``asn_by_ip`` is given, its ``get`` lookup backs the ASN resolver;
    with ``None`` no resolver is registered.
    """
    from counter_robots import ClassifierBuilder, counter_preset, extended_preset

    builder = ClassifierBuilder().use(counter_preset).use(extended_preset)
    if asn_by_ip is None:
        return builder.build()

    def _asn_for(ip):
        return asn_by_ip.get(ip)

    builder.asn_resolver(_asn_for)
    return builder.build()


def _with_visitor_classifier(classifier):
    """Return an app context exposing ``current_stats.visitor_classifier``.

    A bare Flask app gets a stand-in ``invenio-stats`` extension state whose
    only attribute is the given ``classifier``.
    """
    from types import SimpleNamespace

    from flask import Flask

    stats_state = SimpleNamespace(visitor_classifier=classifier)
    flask_app = Flask(__name__)
    flask_app.extensions["invenio-stats"] = stats_state
    return flask_app.app_context()


def test_flag_robots_uses_classifier():
    """``flag_robots`` consults the visitor classifier (extended preset here)."""
    from invenio_stats.processors import flag_robots

    with _with_visitor_classifier(_classifier()):
        # CamelCase bot the case-sensitive COUNTER baseline misses
        spider_event = flag_robots({"user_agent": "YisouSpider"})
        assert spider_event["is_robot"] is True

        browser_event = flag_robots({"user_agent": BROWSER_UA})
        assert browser_event["is_robot"] is False


def test_flag_machines_uses_classifier():
    """``flag_machines`` consults the visitor classifier."""
    from invenio_stats.processors import flag_machines

    with _with_visitor_classifier(_classifier()):
        machine_event = {"user_agent": "Python/3.10 aiohttp/3.10.5"}
        flagged = flag_machines(machine_event)
        assert flagged["is_machine"] is True

        browser_event = flag_machines({"user_agent": BROWSER_UA})
        assert browser_event["is_machine"] is False


def test_exclude_datacenter_browser():
    """Browser events from datacenter IPs are dropped; others pass unchanged.

    "Unchanged" is checked literally: every kept event must be the very same
    object that was passed in, with exactly the content it had before the call
    (so no ``is_datacenter`` or any other field was written).
    """
    import invenio_stats.processors as procs

    # AS16509 AWS (datacenter), AS714 Apple (allow-listed), AS3320 Deutsche Telekom.
    asn_by_ip = {"1.1.1.1": 16509, "2.2.2.2": 714, "3.3.3.3": 3320}
    kept_cases = [
        ("3.3.3.3", BROWSER_UA),  # eyeball ISP
        ("2.2.2.2", BROWSER_UA),  # Apple, allow-listed
        ("1.1.1.1", "python-requests/2.31"),  # not a browser
        (None, BROWSER_UA),  # no IP
    ]

    with _with_visitor_classifier(_classifier(asn_by_ip)):
        dropped = procs.exclude_datacenter_browser(
            {"ip_address": "1.1.1.1", "user_agent": BROWSER_UA}
        )
        assert dropped is None  # browser from AWS -> dropped

        for ip, ua in kept_cases:
            event = {"ip_address": ip, "user_agent": ua}
            before = dict(event)
            result = procs.exclude_datacenter_browser(event)
            # Same object back, and its content was not touched.
            assert result is event, (ip, ua)
            assert result == before, (ip, ua)


def test_exclude_datacenter_browser_noop_without_resolver():
    """Without an ASN resolver the processor returns the event as-is, adding no field."""
    from invenio_stats.processors import exclude_datacenter_browser

    event = {"ip_address": "1.1.1.1", "user_agent": BROWSER_UA}
    with _with_visitor_classifier(_classifier()):
        outcome = exclude_datacenter_browser(event)
        assert outcome is event
        assert "is_datacenter" not in outcome
Loading