Implement Levenshtein algorithm for URL similarities
This commit is contained in:
@@ -3139,6 +3139,117 @@ def deactivate_blocking(shop: str, silent: bool = True) -> bool:
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
# SHOP LOG STATS
|
# SHOP LOG STATS
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
|
def levenshtein_similarity(s1: str, s2: str) -> float:
    """Return the similarity of two strings based on Levenshtein distance.

    Returns:
        A float between 0.0 (completely different) and 1.0 (identical).
    """
    if s1 == s2:
        return 1.0

    n, m = len(s1), len(s2)
    if not n or not m:
        return 0.0

    # Heuristic short-circuit: strings of very different length cannot be
    # similar enough to matter, so skip the expensive DP entirely.
    longest = max(n, m)
    if abs(n - m) > longest * 0.5:
        return 0.0

    # Keep s1 as the shorter string so the DP row stays as small as possible.
    if n > m:
        s1, s2 = s2, s1
        n, m = m, n

    # Wagner-Fischer dynamic programming, two rows at a time.
    row = list(range(n + 1))
    for i, ch2 in enumerate(s2, start=1):
        prev, row = row, [i] + [0] * n
        for j, ch1 in enumerate(s1, start=1):
            cost = 0 if ch1 == ch2 else 1
            row[j] = min(prev[j] + 1,        # insertion
                         row[j - 1] + 1,     # deletion
                         prev[j - 1] + cost) # substitution

    return 1.0 - row[n] / longest
|
||||||
|
|
||||||
|
|
||||||
|
def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.85) -> Dict[str, int]:
    """Group similar URLs using a hybrid approach.

    1. Pre-filter by prefix (first path segment).
    2. Within each prefix group: Levenshtein clustering for small groups.

    Args:
        urls: Dict of {url: count}.
        similarity_threshold: Minimum similarity for grouping (0.0-1.0).

    Returns:
        Dict of {representative_url: summed_count}.
    """
    if not urls:
        return {}

    # Step 1: bucket URLs by their first path segment.
    prefix_groups: Dict[str, list] = {}
    for url, count in urls.items():
        # Extract the prefix (first segment, query string stripped).
        # str.split('?') is a no-op when there is no '?', so no guard needed.
        path = url.split('?')[0]
        segments = path.strip('/').split('/')
        prefix = segments[0][:20] if segments and segments[0] else '/'
        prefix_groups.setdefault(prefix, []).append((url, count))

    # Step 2: within each prefix group, merge similar URLs.
    result: Dict[str, int] = {}

    for prefix, url_list in prefix_groups.items():
        # Most frequent URLs first, so they become cluster representatives.
        url_list.sort(key=lambda x: x[1], reverse=True)

        if len(url_list) <= 100:
            # Small group: Levenshtein clustering.
            clusters = []  # list of [representative_url, total_count, [member urls]]

            for url, count in url_list:
                # Try to attach the URL to an existing cluster.
                added = False
                for cluster in clusters:
                    rep_url = cluster[0]
                    similarity = levenshtein_similarity(url, rep_url)
                    if similarity >= similarity_threshold:
                        cluster[1] += count
                        cluster[2].append(url)
                        added = True
                        break

                if not added:
                    # Start a new cluster.
                    clusters.append([url, count, [url]])

            # Emit one display entry per cluster.
            for rep_url, total_count, members in clusters:
                if len(members) > 1:
                    # Several URLs were merged: strip the query string and
                    # annotate with the member count.
                    display_url = rep_url.split('?')[0]
                    if len(display_url) > 50:
                        display_url = display_url[:47] + '...'
                    display_url += f' ({len(members)}x)'
                else:
                    display_url = rep_url
                    if len(display_url) > 60:
                        display_url = display_url[:57] + '...'
                # BUGFIX: accumulate instead of assigning. Truncated display
                # names from different clusters can collide, and a plain
                # assignment would silently drop the earlier cluster's count.
                result[display_url] = result.get(display_url, 0) + total_count
        else:
            # Large group: fall back to simple prefix aggregation.
            total = sum(c for _, c in url_list)
            display_prefix = f'/{prefix}/* ({len(url_list)} URLs)'
            result[display_prefix] = result.get(display_prefix, 0) + total

    return result
|
||||||
|
|
||||||
|
|
||||||
def get_shop_log_stats(shop: str) -> Dict[str, Any]:
|
def get_shop_log_stats(shop: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Sammelt Statistiken aus dem Shop-Log (v2.5).
|
Sammelt Statistiken aus dem Shop-Log (v2.5).
|
||||||
@@ -3165,8 +3276,9 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
|
|||||||
'unique_bots': 0,
|
'unique_bots': 0,
|
||||||
'unique_countries': 0,
|
'unique_countries': 0,
|
||||||
'top_bots': {},
|
'top_bots': {},
|
||||||
'top_ips': {},
|
'top_ips': [],
|
||||||
'top_countries': {},
|
'top_countries': {},
|
||||||
|
'top_requests': {},
|
||||||
'human_requests': 0,
|
'human_requests': 0,
|
||||||
'bot_requests': 0
|
'bot_requests': 0
|
||||||
}
|
}
|
||||||
@@ -3174,6 +3286,7 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
|
|||||||
ips = {}
|
ips = {}
|
||||||
bots = {}
|
bots = {}
|
||||||
countries = {}
|
countries = {}
|
||||||
|
uris = {}
|
||||||
|
|
||||||
# Log-Datei auswerten
|
# Log-Datei auswerten
|
||||||
if os.path.isfile(log_file):
|
if os.path.isfile(log_file):
|
||||||
@@ -3235,6 +3348,17 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# URI extrahieren
|
||||||
|
uri = None
|
||||||
|
if 'URI: ' in line:
|
||||||
|
try:
|
||||||
|
uri = line.split('URI: ')[1].split(' |')[0].strip()
|
||||||
|
# Leere URIs ignorieren
|
||||||
|
if uri and uri != '/':
|
||||||
|
uris[uri] = uris.get(uri, 0) + 1
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
# Statistiken sammeln
|
# Statistiken sammeln
|
||||||
if ip:
|
if ip:
|
||||||
ips[ip] = ips.get(ip, 0) + 1
|
ips[ip] = ips.get(ip, 0) + 1
|
||||||
@@ -3293,18 +3417,34 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
|
|||||||
stats['unique_bots'] = len(bots)
|
stats['unique_bots'] = len(bots)
|
||||||
stats['unique_countries'] = len(countries)
|
stats['unique_countries'] = len(countries)
|
||||||
|
|
||||||
# Top Bots (max 10)
|
# Top Bots (max 20)
|
||||||
sorted_bots = sorted(bots.items(), key=lambda x: x[1], reverse=True)[:10]
|
sorted_bots = sorted(bots.items(), key=lambda x: x[1], reverse=True)[:20]
|
||||||
stats['top_bots'] = dict(sorted_bots)
|
stats['top_bots'] = dict(sorted_bots)
|
||||||
|
|
||||||
# Top IPs (max 10)
|
# Top IPs (max 20) - mit Country-Info
|
||||||
sorted_ips = sorted(ips.items(), key=lambda x: x[1], reverse=True)[:10]
|
sorted_ips = sorted(ips.items(), key=lambda x: x[1], reverse=True)[:20]
|
||||||
stats['top_ips'] = dict(sorted_ips)
|
top_ips_list = []
|
||||||
|
for ip, count in sorted_ips:
|
||||||
|
ip_info = get_ip_info(ip)
|
||||||
|
top_ips_list.append({
|
||||||
|
'ip': ip,
|
||||||
|
'count': count,
|
||||||
|
'country': ip_info.get('countryCode', 'XX'),
|
||||||
|
'org': ip_info.get('org', '') or ip_info.get('isp', ''),
|
||||||
|
'asn': ip_info.get('as', '')
|
||||||
|
})
|
||||||
|
stats['top_ips'] = top_ips_list
|
||||||
|
|
||||||
# Top Countries (max 10)
|
# Top Countries (max 20)
|
||||||
sorted_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)[:10]
|
sorted_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)[:20]
|
||||||
stats['top_countries'] = dict(sorted_countries)
|
stats['top_countries'] = dict(sorted_countries)
|
||||||
|
|
||||||
|
# Top Requests (max 20) - mit Ähnlichkeits-Gruppierung
|
||||||
|
if uris:
|
||||||
|
grouped_uris = group_similar_urls(uris, similarity_threshold=0.85)
|
||||||
|
sorted_uris = sorted(grouped_uris.items(), key=lambda x: x[1], reverse=True)[:20]
|
||||||
|
stats['top_requests'] = dict(sorted_uris)
|
||||||
|
|
||||||
# Req/min berechnen
|
# Req/min berechnen
|
||||||
activation_time = get_shop_activation_time(shop)
|
activation_time = get_shop_activation_time(shop)
|
||||||
if activation_time and stats['log_entries'] > 0:
|
if activation_time and stats['log_entries'] > 0:
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user