levenshtein algo implementiert url similarities

2026-01-09 13:26:22 +01:00
parent 48d47c29ab
commit 048af8a3dd
2 changed files with 168 additions and 12 deletions
--- a/jtl-wafi-agent.py
+++ b/jtl-wafi-agent.py
@@ -3139,6 +3139,117 @@ def deactivate_blocking(shop: str, silent: bool = True) -> bool:
 # =============================================================================
 # SHOP LOG STATS
 # =============================================================================
+
+def levenshtein_similarity(s1: str, s2: str) -> float:
+    """
+    Score how alike two strings are, based on their Levenshtein distance.
+
+    The score is ``1.0 - distance / len(longer string)``.
+
+    Shortcuts taken before the distance is computed:
+    - equal strings score 1.0,
+    - if either string is empty the score is 0.0,
+    - if the lengths differ by more than half of the longer length the
+      score is reported as 0.0 without computing the distance.
+
+    Returns: float between 0.0 (completely different) and 1.0 (identical)
+    """
+    if s1 == s2:
+        return 1.0
+
+    # Work with "shorter"/"longer" so each DP row is sized by the shorter string.
+    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
+    short_len, long_len = len(shorter), len(longer)
+
+    if short_len == 0:
+        return 0.0
+
+    # Cheap rejection: a large length gap already implies low similarity.
+    if (long_len - short_len) > long_len * 0.5:
+        return 0.0
+
+    # Row-by-row dynamic programming; only the previous row is kept.
+    prev_row = list(range(short_len + 1))
+    for row_no, long_ch in enumerate(longer, start=1):
+        row = [row_no]
+        for col_no, short_ch in enumerate(shorter, start=1):
+            insert_cost = prev_row[col_no] + 1
+            remove_cost = row[col_no - 1] + 1
+            replace_cost = prev_row[col_no - 1] + (1 if short_ch != long_ch else 0)
+            row.append(min(insert_cost, remove_cost, replace_cost))
+        prev_row = row
+
+    distance = prev_row[short_len]
+    return 1.0 - (distance / long_len)
+
+
+def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.85) -> Dict[str, int]:
+    """
+    Collapse a URL hit counter into a smaller counter of display labels.
+
+    Hybrid approach:
+    1. URLs are bucketed by the first 20 characters of their first path
+       segment (the query string is ignored). URLs without a first segment
+       share the '/' bucket.
+    2. Buckets with at most 100 URLs are clustered greedily: URLs are
+       visited by descending count and join the first cluster whose
+       representative (the cluster's first, most frequent URL) reaches
+       ``similarity_threshold`` according to ``levenshtein_similarity``.
+    3. Larger buckets are reported as one wildcard entry for the prefix.
+
+    Labels:
+    - single URL: the URL itself, cut to 57 chars + '...' if longer than 60,
+    - merged cluster: representative without query string (cut to 47 chars
+      + '...' if longer than 50) followed by ' (<members>x)',
+    - large bucket: '/<prefix>/* (<n> URLs)', or '/* (<n> URLs)' for the
+      '/' bucket.
+
+    If two entries end up with the same label (possible after query
+    stripping or truncation) their counts are added instead of one entry
+    overwriting the other; the '(<members>x)' suffix then describes a single
+    cluster only.
+
+    Args:
+        urls: Dict of {url: count}
+        similarity_threshold: minimum similarity for merging (0.0-1.0)
+
+    Returns:
+        Dict of {display_label: summed_count}
+    """
+    if not urls:
+        return {}
+
+    # Step 1: bucket by (truncated) first path segment.
+    prefix_groups = {}
+    for url, count in urls.items():
+        path = url.split('?', 1)[0]
+        first_segment = path.strip('/').split('/', 1)[0]
+        prefix = first_segment[:20] or '/'
+        prefix_groups.setdefault(prefix, []).append((url, count))
+
+    # Step 2: reduce every bucket to display labels.
+    result = {}
+
+    for prefix, url_list in prefix_groups.items():
+        if len(url_list) > 100:
+            # Large bucket: pairwise Levenshtein would be too expensive,
+            # so fall back to a plain prefix summary.
+            total = sum(c for _, c in url_list)
+            # The root bucket's prefix is '/' itself; avoid rendering '///*'.
+            wildcard = '/*' if prefix == '/' else f'/{prefix}/*'
+            label = f'{wildcard} ({len(url_list)} URLs)'
+            result[label] = result.get(label, 0) + total
+            continue
+
+        # Most frequent first, so each cluster is represented by its top URL.
+        url_list.sort(key=lambda x: x[1], reverse=True)
+
+        clusters = []  # list of [representative, total_count, member_count]
+        for url, count in url_list:
+            for cluster in clusters:
+                if levenshtein_similarity(url, cluster[0]) >= similarity_threshold:
+                    cluster[1] += count
+                    cluster[2] += 1
+                    break
+            else:
+                # No sufficiently similar cluster found: start a new one.
+                clusters.append([url, count, 1])
+
+        for rep_url, total_count, member_count in clusters:
+            if member_count > 1:
+                # Merged URLs usually differ in the query, so drop it.
+                display_url = rep_url.split('?')[0]
+                if len(display_url) > 50:
+                    display_url = display_url[:47] + '...'
+                display_url += f' ({member_count}x)'
+            else:
+                display_url = rep_url
+                if len(display_url) > 60:
+                    display_url = display_url[:57] + '...'
+            # Accumulate: different clusters can map to the same label, and
+            # plain assignment would silently drop the earlier count.
+            result[display_url] = result.get(display_url, 0) + total_count
+
+    return result
+
+
 def get_shop_log_stats(shop: str) -> Dict[str, Any]:
    """
    Sammelt Statistiken aus dem Shop-Log (v2.5).
@@ -3165,8 +3276,9 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
        'unique_bots': 0,
        'unique_countries': 0,
        'top_bots': {},
-        'top_ips': {},
+        'top_ips': [],
        'top_countries': {},
+        'top_requests': {},
        'human_requests': 0,
        'bot_requests': 0
    }
@@ -3174,6 +3286,7 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
    ips = {}
    bots = {}
    countries = {}
+    uris = {}

    # Log-Datei auswerten
    if os.path.isfile(log_file):
@@ -3235,6 +3348,17 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
                        except:
                            pass

+                    # URI extrahieren
+                    uri = None
+                    if 'URI: ' in line:
+                        try:
+                            uri = line.split('URI: ')[1].split(' |')[0].strip()
+                            # Leere URIs ignorieren
+                            if uri and uri != '/':
+                                uris[uri] = uris.get(uri, 0) + 1
+                        except:
+                            pass
+
                    # Statistiken sammeln
                    if ip:
                        ips[ip] = ips.get(ip, 0) + 1
@@ -3293,18 +3417,34 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
    stats['unique_bots'] = len(bots)
    stats['unique_countries'] = len(countries)

-    # Top Bots (max 10)
-    sorted_bots = sorted(bots.items(), key=lambda x: x[1], reverse=True)[:10]
+    # Top Bots (max 20)
+    sorted_bots = sorted(bots.items(), key=lambda x: x[1], reverse=True)[:20]
    stats['top_bots'] = dict(sorted_bots)

-    # Top IPs (max 10)
-    sorted_ips = sorted(ips.items(), key=lambda x: x[1], reverse=True)[:10]
-    stats['top_ips'] = dict(sorted_ips)
+    # Top IPs (max 20) - mit Country-Info
+    sorted_ips = sorted(ips.items(), key=lambda x: x[1], reverse=True)[:20]
+    top_ips_list = []
+    for ip, count in sorted_ips:
+        ip_info = get_ip_info(ip)
+        top_ips_list.append({
+            'ip': ip,
+            'count': count,
+            'country': ip_info.get('countryCode', 'XX'),
+            'org': ip_info.get('org', '') or ip_info.get('isp', ''),
+            'asn': ip_info.get('as', '')
+        })
+    stats['top_ips'] = top_ips_list

-    # Top Countries (max 10)
-    sorted_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)[:10]
+    # Top Countries (max 20)
+    sorted_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)[:20]
    stats['top_countries'] = dict(sorted_countries)

+    # Top Requests (max 20) - mit Ähnlichkeits-Gruppierung
+    if uris:
+        grouped_uris = group_similar_urls(uris, similarity_threshold=0.85)
+        sorted_uris = sorted(grouped_uris.items(), key=lambda x: x[1], reverse=True)[:20]
+        stats['top_requests'] = dict(sorted_uris)
+
    # Req/min berechnen
    activation_time = get_shop_activation_time(shop)
    if activation_time and stats['log_entries'] > 0: