Implement Levenshtein-based URL similarity grouping for shop log stats
@@ -3139,6 +3139,117 @@ def deactivate_blocking(shop: str, silent: bool = True) -> bool:
 # =============================================================================
 # SHOP LOG STATS
 # =============================================================================
+
+def levenshtein_similarity(s1: str, s2: str) -> float:
+    """
+    Computes the similarity of two strings based on Levenshtein distance.
+    Returns: float between 0.0 (completely different) and 1.0 (identical)
+    """
+    if s1 == s2:
+        return 1.0
+    len1, len2 = len(s1), len(s2)
+    if len1 == 0 or len2 == 0:
+        return 0.0
+
+    # Optimization: with very different lengths the similarity is low anyway
+    if abs(len1 - len2) > max(len1, len2) * 0.5:
+        return 0.0
+
+    # Compute the Levenshtein distance (dynamic programming, two rows)
+    if len1 > len2:
+        s1, s2, len1, len2 = s2, s1, len2, len1
+
+    current_row = range(len1 + 1)
+    for i in range(1, len2 + 1):
+        previous_row, current_row = current_row, [i] + [0] * len1
+        for j in range(1, len1 + 1):
+            add, delete, change = previous_row[j] + 1, current_row[j-1] + 1, previous_row[j-1]
+            if s1[j-1] != s2[i-1]:
+                change += 1
+            current_row[j] = min(add, delete, change)
+
+    distance = current_row[len1]
+    max_len = max(len1, len2)
+    return 1.0 - (distance / max_len)
+
+
+def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.85) -> Dict[str, int]:
+    """
+    Groups similar URLs using a hybrid approach:
+    1. Pre-filter by prefix (first path segment)
+    2. Within each group: Levenshtein clustering for small groups
+
+    Args:
+        urls: dict of {url: count}
+        similarity_threshold: minimum similarity for grouping (0.0-1.0)
+
+    Returns:
+        dict of {representative_url: summed_count}
+    """
+    if not urls:
+        return {}
+
+    # Step 1: group by the first path segment
+    prefix_groups = {}
+    for url, count in urls.items():
+        # Extract the prefix (first segment, up to the first ?)
+        path = url.split('?')[0] if '?' in url else url
+        segments = path.strip('/').split('/')
+        prefix = segments[0][:20] if segments and segments[0] else '/'
+
+        if prefix not in prefix_groups:
+            prefix_groups[prefix] = []
+        prefix_groups[prefix].append((url, count))
+
+    # Step 2: merge similar URLs within each prefix group
+    result = {}
+
+    for prefix, url_list in prefix_groups.items():
+        # Sort by count (most frequent first)
+        url_list.sort(key=lambda x: x[1], reverse=True)
+
+        # Small groups: Levenshtein clustering
+        if len(url_list) <= 100:
+            clusters = []  # list of [representative, total_count, [urls]]
+
+            for url, count in url_list:
+                # Try to add the URL to an existing cluster
+                added = False
+                for cluster in clusters:
+                    rep_url = cluster[0]
+                    similarity = levenshtein_similarity(url, rep_url)
+                    if similarity >= similarity_threshold:
+                        cluster[1] += count
+                        cluster[2].append(url)
+                        added = True
+                        break
+
+                if not added:
+                    # Start a new cluster
+                    clusters.append([url, count, [url]])
+
+            # Emit the cluster representatives as the result
+            for rep_url, total_count, members in clusters:
+                # If multiple URLs were merged, mark the entry with a count suffix
+                if len(members) > 1:
+                    display_url = rep_url.split('?')[0]  # strip query parameters
+                    if len(display_url) > 50:
+                        display_url = display_url[:47] + '...'
+                    display_url += f' ({len(members)}x)'
+                else:
+                    display_url = rep_url
+                    if len(display_url) > 60:
+                        display_url = display_url[:57] + '...'
+                result[display_url] = total_count
+        else:
+            # Large groups: simple prefix grouping
+            total = sum(c for _, c in url_list)
+            display_prefix = f'/{prefix}/* ({len(url_list)} URLs)'
+            result[display_prefix] = total
+
+    return result
+
+
 def get_shop_log_stats(shop: str) -> Dict[str, Any]:
     """
     Collects statistics from the shop log (v2.5).
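For reference, a minimal usage sketch of the two helpers added above. It assumes both functions are importable exactly as defined in this diff; the strings and URLs are made up for illustration:

```python
# Worked example: Levenshtein("kitten", "sitting") = 3 edits, max length 7,
# so the similarity is 1 - 3/7 ~= 0.571.
print(levenshtein_similarity('kitten', 'sitting'))   # 0.5714...

# Two /product/ URLs differing only in the trailing ID have similarity
# 1 - 2/14 ~= 0.857 >= 0.85, so they collapse into one '(2x)' entry.
hits = {'/product/12345': 40, '/product/12399': 25, '/checkout': 10}
print(group_similar_urls(hits, similarity_threshold=0.85))
# {'/product/12345 (2x)': 65, '/checkout': 10}
```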
@@ -3165,8 +3276,9 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
         'unique_bots': 0,
         'unique_countries': 0,
         'top_bots': {},
-        'top_ips': {},
+        'top_ips': [],
         'top_countries': {},
+        'top_requests': {},
         'human_requests': 0,
         'bot_requests': 0
     }
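Worth noting for consumers of the stats dict: with this change 'top_ips' is a list of per-IP dicts rather than an {ip: count} mapping. A sketch of reading the new shape (the shop name and print layout are illustrative only):

```python
# 'top_ips' is now a list of dicts, so iterate instead of calling .items().
stats = get_shop_log_stats('example-shop')   # shop name is hypothetical
for entry in stats['top_ips']:
    print(f"{entry['ip']:>15}  {entry['count']:>6}  {entry['country']}  {entry['org']}")
```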
@@ -3174,6 +3286,7 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
     ips = {}
     bots = {}
     countries = {}
+    uris = {}
 
     # Evaluate the log file
     if os.path.isfile(log_file):
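The new uris accumulator uses the same plain-dict counting idiom as ips/bots/countries. A collections.Counter would behave identically and still feed group_similar_urls unchanged, since Counter is a dict subclass; a sketch, not part of the commit:

```python
from collections import Counter

# Equivalent to uris[uri] = uris.get(uri, 0) + 1 on a plain dict.
uris = Counter()
for uri in ('/a', '/b', '/a'):
    uris[uri] += 1
print(uris)   # Counter({'/a': 2, '/b': 1})
```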
@@ -3235,6 +3348,17 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
             except:
                 pass
 
+            # Extract the URI
+            uri = None
+            if 'URI: ' in line:
+                try:
+                    uri = line.split('URI: ')[1].split(' |')[0].strip()
+                    # Ignore empty URIs and the bare root path
+                    if uri and uri != '/':
+                        uris[uri] = uris.get(uri, 0) + 1
+                except:
+                    pass
+
             # Collect statistics
             if ip:
                 ips[ip] = ips.get(ip, 0) + 1
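To illustrate the extraction logic above: the parser relies only on a ' | '-separated 'URI: ' field. The full log line format is not visible in this diff, so the sample line below is hypothetical:

```python
# Hypothetical log line; only the "URI: ... |" field shape is assumed.
line = "2024-01-01 12:00:00 | IP: 203.0.113.7 | URI: /product/12345?ref=feed | UA: ExampleBot"
uri = line.split('URI: ')[1].split(' |')[0].strip()
print(uri)   # /product/12345?ref=feed
```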
@@ -3293,18 +3417,34 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
     stats['unique_bots'] = len(bots)
     stats['unique_countries'] = len(countries)
 
-    # Top Bots (max 10)
-    sorted_bots = sorted(bots.items(), key=lambda x: x[1], reverse=True)[:10]
+    # Top Bots (max 20)
+    sorted_bots = sorted(bots.items(), key=lambda x: x[1], reverse=True)[:20]
     stats['top_bots'] = dict(sorted_bots)
 
-    # Top IPs (max 10)
-    sorted_ips = sorted(ips.items(), key=lambda x: x[1], reverse=True)[:10]
-    stats['top_ips'] = dict(sorted_ips)
+    # Top IPs (max 20) - with country info
+    sorted_ips = sorted(ips.items(), key=lambda x: x[1], reverse=True)[:20]
+    top_ips_list = []
+    for ip, count in sorted_ips:
+        ip_info = get_ip_info(ip)
+        top_ips_list.append({
+            'ip': ip,
+            'count': count,
+            'country': ip_info.get('countryCode', 'XX'),
+            'org': ip_info.get('org', '') or ip_info.get('isp', ''),
+            'asn': ip_info.get('as', '')
+        })
+    stats['top_ips'] = top_ips_list
 
-    # Top Countries (max 10)
-    sorted_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)[:10]
+    # Top Countries (max 20)
+    sorted_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)[:20]
     stats['top_countries'] = dict(sorted_countries)
 
+    # Top Requests (max 20) - with similarity grouping
+    if uris:
+        grouped_uris = group_similar_urls(uris, similarity_threshold=0.85)
+        sorted_uris = sorted(grouped_uris.items(), key=lambda x: x[1], reverse=True)[:20]
+        stats['top_requests'] = dict(sorted_uris)
+
     # Compute requests per minute
     activation_time = get_shop_activation_time(shop)
     if activation_time and stats['log_entries'] > 0:
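Finally, a behavioural sketch of the grouping that now feeds 'top_requests': prefix groups with more than 100 distinct URLs skip Levenshtein and collapse to a wildcard entry, while the 0.85 threshold decides merging in small groups. The URLs are generated for illustration:

```python
# Large-group fallback: >100 distinct URLs under one prefix become a wildcard.
many = {f'/img/banner-{i}.png': 1 for i in range(150)}
print(group_similar_urls(many))         # {'/img/* (150 URLs)': 150}

# Small groups: the threshold decides. 0.85 merges the near-identical pair
# (similarity ~0.857); a stricter 0.95 keeps them separate.
pair = {'/product/12345': 40, '/product/12399': 25}
print(group_similar_urls(pair, 0.85))   # {'/product/12345 (2x)': 65}
print(group_similar_urls(pair, 0.95))   # {'/product/12345': 40, '/product/12399': 25}
```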
File diff suppressed because one or more lines are too long