perf optimization
@@ -3173,15 +3173,14 @@ def levenshtein_similarity(s1: str, s2: str) -> float:
     return 1.0 - (distance / max_len)


-def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.85) -> Dict[str, int]:
+def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.80) -> Dict[str, int]:
     """
-    Groups similar URLs using a hybrid approach:
-    1. Pre-filter by prefix (first path segment)
-    2. Within each group: Levenshtein clustering for small groups
+    Groups similar URLs using a fast prefix-based approach.
+    Optimized for performance - no more Levenshtein because of CPU load.

     Args:
         urls: Dict of {url: count}
-        similarity_threshold: Minimum similarity for grouping (0.0-1.0)
+        similarity_threshold: No longer used (kept for compatibility)

     Returns:
         Dict of {representative_url: summed_count}
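The hunk above only shows the tail of `levenshtein_similarity`, the helper the old clustering path called. For context, a minimal sketch of such a helper with the signature and final return expression seen in the diff; the body before the return is not part of the commit and is an assumed standard implementation:

def levenshtein_similarity(s1: str, s2: str) -> float:
    # Assumed body: standard Wagner-Fischer edit distance with rolling rows.
    # Only the signature and the final return line appear in the diff above.
    if not s1 and not s2:
        return 1.0
    m, n = len(s1), len(s2)
    prev = list(range(n + 1))
    for i in range(1, m + 1):
        curr = [i] + [0] * n
        for j in range(1, n + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            curr[j] = min(prev[j] + 1,         # deletion
                          curr[j - 1] + 1,     # insertion
                          prev[j - 1] + cost)  # substitution
        prev = curr
    distance = prev[n]
    max_len = max(m, n)
    return 1.0 - (distance / max_len)

Each call is O(len(s1) * len(s2)), which is why running it pairwise inside the grouping loop was the CPU hotspot this commit removes.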
@@ -3189,63 +3188,41 @@ def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.85)
     if not urls:
         return {}

-    # Step 1: group by first path segment
+    # Only process the top 500 URLs (performance)
+    sorted_urls = sorted(urls.items(), key=lambda x: x[1], reverse=True)[:500]
+
+    # Group by normalized path (strip query params, truncate to 50 chars)
     prefix_groups = {}
-    for url, count in urls.items():
-        # Extract prefix (first segment, or up to the first ?)
+    for url, count in sorted_urls:
+        # Normalize URL: strip query parameters
         path = url.split('?')[0] if '?' in url else url
-        segments = path.strip('/').split('/')
-        prefix = segments[0][:20] if segments and segments[0] else '/'
+        # Truncate to the first 50 characters for grouping
+        normalized = path[:50]

-        if prefix not in prefix_groups:
-            prefix_groups[prefix] = []
-        prefix_groups[prefix].append((url, count))
+        if normalized not in prefix_groups:
+            prefix_groups[normalized] = {'urls': [], 'total': 0}
+        prefix_groups[normalized]['urls'].append(url)
+        prefix_groups[normalized]['total'] += count

-    # Step 2: merge similar URLs within each prefix group
+    # Build the result
     result = {}
-
-    for prefix, url_list in prefix_groups.items():
-        # Sort by count (most frequent first)
-        url_list.sort(key=lambda x: x[1], reverse=True)
-
-        # Small groups: Levenshtein clustering
-        if len(url_list) <= 100:
-            clusters = []  # list of (representative, total_count, [urls])
-
-            for url, count in url_list:
-                # Try to add to an existing cluster
-                added = False
-                for cluster in clusters:
-                    rep_url = cluster[0]
-                    similarity = levenshtein_similarity(url, rep_url)
-                    if similarity >= similarity_threshold:
-                        cluster[1] += count
-                        cluster[2].append(url)
-                        added = True
-                        break
-
-                if not added:
-                    # New cluster
-                    clusters.append([url, count, [url]])
-
-            # Cluster representatives become the result
-            for rep_url, total_count, members in clusters:
-                # If several URLs were merged, mark them with a count
-                if len(members) > 1:
-                    display_url = rep_url.split('?')[0]  # strip query parameters
-                    if len(display_url) > 50:
-                        display_url = display_url[:47] + '...'
-                    display_url += f' ({len(members)}x)'
-                else:
-                    display_url = rep_url
-                    if len(display_url) > 60:
-                        display_url = display_url[:57] + '...'
-                result[display_url] = total_count
-        else:
-            # Large groups: simple prefix grouping
-            total = sum(c for _, c in url_list)
-            display_prefix = f'/{prefix}/* ({len(url_list)} URLs)'
-            result[display_prefix] = total
+    for normalized, data in prefix_groups.items():
+        url_count = len(data['urls'])
+        total = data['total']
+
+        if url_count > 1:
+            # Several similar URLs
+            display_url = normalized
+            if len(display_url) > 47:
+                display_url = display_url[:44] + '...'
+            display_url += f' ({url_count}x)'
+        else:
+            # Single URL
+            display_url = data['urls'][0]
+            if len(display_url) > 60:
+                display_url = display_url[:57] + '...'
+
+        result[display_url] = total

     return result
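For illustration, here is the new grouping logic condensed into a self-contained sketch with hypothetical access-log URLs; `group_similar_urls_sketch` is a stand-in name, not part of the commit:

from typing import Dict

def group_similar_urls_sketch(urls: Dict[str, int]) -> Dict[str, int]:
    # Only the 500 most frequent URLs are considered, for performance.
    top = sorted(urls.items(), key=lambda x: x[1], reverse=True)[:500]
    groups: Dict[str, dict] = {}
    for url, count in top:
        # Normalize: strip the query string, truncate to 50 characters.
        normalized = url.split('?')[0][:50]
        g = groups.setdefault(normalized, {'urls': [], 'total': 0})
        g['urls'].append(url)
        g['total'] += count
    result = {}
    for normalized, g in groups.items():
        if len(g['urls']) > 1:
            label = (normalized[:44] + '...') if len(normalized) > 47 else normalized
            label += f" ({len(g['urls'])}x)"
        else:
            label = g['urls'][0]
            if len(label) > 60:
                label = label[:57] + '...'
        result[label] = g['total']
    return result

print(group_similar_urls_sketch({
    '/products/shoe-1?ref=google': 120,
    '/products/shoe-1?ref=bing': 80,
    '/checkout': 15,
}))
# {'/products/shoe-1 (2x)': 200, '/checkout': 15}

The trade-off versus the old Levenshtein path: query-parameter variants of the same path still collapse into one entry, but URLs that differ within their first 50 path characters (e.g. typo variants) are no longer merged. In exchange, the function is O(n log n) for the sort instead of quadratic in the group size.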
@@ -3421,17 +3398,18 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
     sorted_bots = sorted(bots.items(), key=lambda x: x[1], reverse=True)[:20]
     stats['top_bots'] = dict(sorted_bots)

-    # Top IPs (max 20) - with country info
+    # Top IPs (max 20) - with country info (cache only, no API calls!)
     sorted_ips = sorted(ips.items(), key=lambda x: x[1], reverse=True)[:20]
     top_ips_list = []
     for ip, count in sorted_ips:
-        ip_info = get_ip_info(ip)
+        # Only a fast country lookup from the local cache, no external API calls
+        country_code = get_country_for_ip_cached(ip)
         top_ips_list.append({
             'ip': ip,
             'count': count,
-            'country': ip_info.get('countryCode', 'XX'),
-            'org': ip_info.get('org', '') or ip_info.get('isp', ''),
-            'asn': ip_info.get('as', '')
+            'country': country_code if country_code else 'XX',
+            'org': '',
+            'asn': ''
         })
     stats['top_ips'] = top_ips_list
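`get_country_for_ip_cached` itself is not shown in this commit; a minimal sketch of the contract the new code appears to rely on (the cache name and layout are assumptions):

from typing import Dict, Optional

# Hypothetical in-memory cache, e.g. populated by earlier get_ip_info() calls.
_ip_country_cache: Dict[str, str] = {}

def get_country_for_ip_cached(ip: str) -> Optional[str]:
    # Pure dict lookup: returns a cached ISO country code or None.
    # Crucially, it never triggers a network request, so building the
    # stats no longer blocks on an external geo-IP API per top IP.
    return _ip_country_cache.get(ip)

With this change, `org` and `asn` are left empty rather than fetched, trading detail in the top-IPs table for a stats endpoint that never stalls on 20 sequential API lookups.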