perf optimization

2026-01-09 13:47:09 +01:00
parent 048af8a3dd
commit 80fa35fb74
1 changed files with 38 additions and 60 deletions
--- a/jtl-wafi-agent.py
+++ b/jtl-wafi-agent.py
@@ -3173,15 +3173,14 @@ def levenshtein_similarity(s1: str, s2: str) -> float:
    return 1.0 - (distance / max_len)
-def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.85) -> Dict[str, int]:
+def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.80) -> Dict[str, int]:
    """
-    Gruppiert ähnliche URLs basierend auf Hybrid-Ansatz:
+    Gruppiert ähnliche URLs basierend auf schnellem Präfix-Ansatz.
-    1. Vorfiltern nach Präfix (erstes Pfad-Segment)
+    Optimiert für Performance - keine Levenshtein mehr wegen CPU-Last.
-    2. Innerhalb der Gruppe: Levenshtein für kleine Gruppen
    Args:
        urls: Dict mit {url: count}
-        similarity_threshold: Mindest-Ähnlichkeit für Gruppierung (0.0-1.0)
+        similarity_threshold: Nicht mehr verwendet (Kompatibilität)
    Returns:
        Dict mit {repräsentative_url: summierter_count}
@@ -3189,63 +3188,41 @@ def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.85)
    if not urls:
        return {}
-    # Schritt 1: Nach erstem Pfad-Segment gruppieren
+    # Nur Top 500 URLs verarbeiten (Performance)
+    sorted_urls = sorted(urls.items(), key=lambda x: x[1], reverse=True)[:500]
+    # Nach normalisiertem Pfad gruppieren (Query-Params entfernen, auf 50 Zeichen kürzen)
    prefix_groups = {}
-    for url, count in urls.items():
+    for url, count in sorted_urls:
-        # Präfix extrahieren (erstes Segment oder bis zum ersten ?)
+        # URL normalisieren: Query-Parameter entfernen
        path = url.split('?')[0] if '?' in url else url
-        segments = path.strip('/').split('/')
+        # Auf erste 50 Zeichen kürzen für Gruppierung
-        prefix = segments[0][:20] if segments and segments[0] else '/'
+        normalized = path[:50]
-        if prefix not in prefix_groups:
+        if normalized not in prefix_groups:
-            prefix_groups[prefix] = []
+            prefix_groups[normalized] = {'urls': [], 'total': 0}
-        prefix_groups[prefix].append((url, count))
+        prefix_groups[normalized]['urls'].append(url)
+        prefix_groups[normalized]['total'] += count
-    # Schritt 2: Innerhalb jeder Präfix-Gruppe ähnliche URLs zusammenfassen
+    # Ergebnis erstellen
    result = {}
+    for normalized, data in prefix_groups.items():
+        url_count = len(data['urls'])
+        total = data['total']
-    for prefix, url_list in prefix_groups.items():
+        if url_count > 1:
-        # Sortieren nach Count (häufigste zuerst)
+            # Mehrere ähnliche URLs
-        url_list.sort(key=lambda x: x[1], reverse=True)
+            display_url = normalized
-
+            if len(display_url) > 47:
-        # Für kleine Gruppen: Levenshtein-Gruppierung
+                display_url = display_url[:44] + '...'
-        if len(url_list) <= 100:
+            display_url += f' ({url_count}x)'
-            clusters = []  # Liste von (repräsentant, total_count, [urls])
-            for url, count in url_list:
-                # Versuche, zu einem bestehenden Cluster hinzuzufügen
-                added = False
-                for cluster in clusters:
-                    rep_url = cluster[0]
-                    similarity = levenshtein_similarity(url, rep_url)
-                    if similarity >= similarity_threshold:
-                        cluster[1] += count
-                        cluster[2].append(url)
-                        added = True
-                        break
-                if not added:
-                    # Neuer Cluster
-                    clusters.append([url, count, [url]])
-            # Cluster-Repräsentanten als Ergebnis
-            for rep_url, total_count, members in clusters:
-                # Wenn mehrere URLs zusammengefasst wurden, markiere mit *
-                if len(members) > 1:
-                    display_url = rep_url.split('?')[0]  # Query-Parameter entfernen
-                    if len(display_url) > 50:
-                        display_url = display_url[:47] + '...'
-                    display_url += f' ({len(members)}x)'
-                else:
-                    display_url = rep_url
-                    if len(display_url) > 60:
-                        display_url = display_url[:57] + '...'
-                result[display_url] = total_count
        else:
-            # Für große Gruppen: Einfache Präfix-Gruppierung
+            # Einzelne URL
-            total = sum(c for _, c in url_list)
+            display_url = data['urls'][0]
-            display_prefix = f'/{prefix}/* ({len(url_list)} URLs)'
+            if len(display_url) > 60:
-            result[display_prefix] = total
+                display_url = display_url[:57] + '...'
+        result[display_url] = total
    return result
@@ -3421,17 +3398,18 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
    sorted_bots = sorted(bots.items(), key=lambda x: x[1], reverse=True)[:20]
    stats['top_bots'] = dict(sorted_bots)
-    # Top IPs (max 20) - mit Country-Info
+    # Top IPs (max 20) - mit Country-Info (nur aus Cache, keine API-Aufrufe!)
    sorted_ips = sorted(ips.items(), key=lambda x: x[1], reverse=True)[:20]
    top_ips_list = []
    for ip, count in sorted_ips:
-        ip_info = get_ip_info(ip)
+        # Nur schnellen Country-Lookup aus lokalem Cache, keine externen API-Aufrufe
+        country_code = get_country_for_ip_cached(ip)
        top_ips_list.append({
            'ip': ip,
            'count': count,
-            'country': ip_info.get('countryCode', 'XX'),
+            'country': country_code if country_code else 'XX',
-            'org': ip_info.get('org', '') or ip_info.get('isp', ''),
+            'org': '',
-            'asn': ip_info.get('as', '')
+            'asn': ''
        })
    stats['top_ips'] = top_ips_list