Implement Levenshtein algorithm for URL similarities
This commit is contained in:
@@ -3139,6 +3139,117 @@ def deactivate_blocking(shop: str, silent: bool = True) -> bool:
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
# SHOP LOG STATS
|
# SHOP LOG STATS
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
|
def levenshtein_similarity(s1: str, s2: str) -> float:
    """Return the similarity of two strings based on Levenshtein distance.

    Returns:
        A float between 0.0 (completely different) and 1.0 (identical).
    """
    if s1 == s2:
        return 1.0

    n, m = len(s1), len(s2)
    if not n or not m:
        return 0.0

    # Heuristic short-circuit: strings of very different length cannot be
    # similar enough to matter, so skip the expensive DP entirely.
    longest = max(n, m)
    if abs(n - m) > longest * 0.5:
        return 0.0

    # Keep s1 as the shorter string so the DP row stays as small as possible.
    if n > m:
        s1, s2 = s2, s1
        n, m = m, n

    # Wagner-Fischer dynamic programming, two rows at a time.
    row = list(range(n + 1))
    for i, ch2 in enumerate(s2, start=1):
        prev, row = row, [i] + [0] * n
        for j, ch1 in enumerate(s1, start=1):
            cost = 0 if ch1 == ch2 else 1
            row[j] = min(prev[j] + 1,        # insertion
                         row[j - 1] + 1,     # deletion
                         prev[j - 1] + cost) # substitution

    return 1.0 - row[n] / longest
|
||||||
|
|
||||||
|
|
||||||
|
def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.85) -> Dict[str, int]:
    """Group similar URLs using a hybrid approach.

    1. Pre-filter by prefix (first path segment).
    2. Within each prefix group: Levenshtein clustering for small groups.

    Args:
        urls: Dict of {url: count}.
        similarity_threshold: Minimum similarity for grouping (0.0-1.0).

    Returns:
        Dict of {representative_url: summed_count}.
    """
    if not urls:
        return {}

    # Step 1: bucket URLs by their first path segment.
    prefix_groups: Dict[str, list] = {}
    for url, count in urls.items():
        # Extract the prefix (first segment, query string stripped).
        # str.split('?') is a no-op when there is no '?', so no guard needed.
        path = url.split('?')[0]
        segments = path.strip('/').split('/')
        prefix = segments[0][:20] if segments and segments[0] else '/'
        prefix_groups.setdefault(prefix, []).append((url, count))

    # Step 2: within each prefix group, merge similar URLs.
    result: Dict[str, int] = {}

    for prefix, url_list in prefix_groups.items():
        # Most frequent URLs first, so they become cluster representatives.
        url_list.sort(key=lambda x: x[1], reverse=True)

        if len(url_list) <= 100:
            # Small group: Levenshtein clustering.
            clusters = []  # list of [representative_url, total_count, [member urls]]

            for url, count in url_list:
                # Try to attach the URL to an existing cluster.
                added = False
                for cluster in clusters:
                    rep_url = cluster[0]
                    similarity = levenshtein_similarity(url, rep_url)
                    if similarity >= similarity_threshold:
                        cluster[1] += count
                        cluster[2].append(url)
                        added = True
                        break

                if not added:
                    # Start a new cluster.
                    clusters.append([url, count, [url]])

            # Emit one display entry per cluster.
            for rep_url, total_count, members in clusters:
                if len(members) > 1:
                    # Several URLs were merged: strip the query string and
                    # annotate with the member count.
                    display_url = rep_url.split('?')[0]
                    if len(display_url) > 50:
                        display_url = display_url[:47] + '...'
                    display_url += f' ({len(members)}x)'
                else:
                    display_url = rep_url
                    if len(display_url) > 60:
                        display_url = display_url[:57] + '...'
                # BUGFIX: accumulate instead of assigning. Truncated display
                # names from different clusters can collide, and a plain
                # assignment would silently drop the earlier cluster's count.
                result[display_url] = result.get(display_url, 0) + total_count
        else:
            # Large group: fall back to simple prefix aggregation.
            total = sum(c for _, c in url_list)
            display_prefix = f'/{prefix}/* ({len(url_list)} URLs)'
            result[display_prefix] = result.get(display_prefix, 0) + total

    return result
|
||||||
|
|
||||||
|
|
||||||
def get_shop_log_stats(shop: str) -> Dict[str, Any]:
|
def get_shop_log_stats(shop: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Sammelt Statistiken aus dem Shop-Log (v2.5).
|
Sammelt Statistiken aus dem Shop-Log (v2.5).
|
||||||
@@ -3165,8 +3276,9 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
|
|||||||
'unique_bots': 0,
|
'unique_bots': 0,
|
||||||
'unique_countries': 0,
|
'unique_countries': 0,
|
||||||
'top_bots': {},
|
'top_bots': {},
|
||||||
'top_ips': {},
|
'top_ips': [],
|
||||||
'top_countries': {},
|
'top_countries': {},
|
||||||
|
'top_requests': {},
|
||||||
'human_requests': 0,
|
'human_requests': 0,
|
||||||
'bot_requests': 0
|
'bot_requests': 0
|
||||||
}
|
}
|
||||||
@@ -3174,6 +3286,7 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
|
|||||||
ips = {}
|
ips = {}
|
||||||
bots = {}
|
bots = {}
|
||||||
countries = {}
|
countries = {}
|
||||||
|
uris = {}
|
||||||
|
|
||||||
# Log-Datei auswerten
|
# Log-Datei auswerten
|
||||||
if os.path.isfile(log_file):
|
if os.path.isfile(log_file):
|
||||||
@@ -3235,6 +3348,17 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# URI extrahieren
|
||||||
|
uri = None
|
||||||
|
if 'URI: ' in line:
|
||||||
|
try:
|
||||||
|
uri = line.split('URI: ')[1].split(' |')[0].strip()
|
||||||
|
# Leere URIs ignorieren
|
||||||
|
if uri and uri != '/':
|
||||||
|
uris[uri] = uris.get(uri, 0) + 1
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
# Statistiken sammeln
|
# Statistiken sammeln
|
||||||
if ip:
|
if ip:
|
||||||
ips[ip] = ips.get(ip, 0) + 1
|
ips[ip] = ips.get(ip, 0) + 1
|
||||||
@@ -3293,18 +3417,34 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
|
|||||||
stats['unique_bots'] = len(bots)
|
stats['unique_bots'] = len(bots)
|
||||||
stats['unique_countries'] = len(countries)
|
stats['unique_countries'] = len(countries)
|
||||||
|
|
||||||
# Top Bots (max 10)
|
# Top Bots (max 20)
|
||||||
sorted_bots = sorted(bots.items(), key=lambda x: x[1], reverse=True)[:10]
|
sorted_bots = sorted(bots.items(), key=lambda x: x[1], reverse=True)[:20]
|
||||||
stats['top_bots'] = dict(sorted_bots)
|
stats['top_bots'] = dict(sorted_bots)
|
||||||
|
|
||||||
# Top IPs (max 10)
|
# Top IPs (max 20) - mit Country-Info
|
||||||
sorted_ips = sorted(ips.items(), key=lambda x: x[1], reverse=True)[:10]
|
sorted_ips = sorted(ips.items(), key=lambda x: x[1], reverse=True)[:20]
|
||||||
stats['top_ips'] = dict(sorted_ips)
|
top_ips_list = []
|
||||||
|
for ip, count in sorted_ips:
|
||||||
|
ip_info = get_ip_info(ip)
|
||||||
|
top_ips_list.append({
|
||||||
|
'ip': ip,
|
||||||
|
'count': count,
|
||||||
|
'country': ip_info.get('countryCode', 'XX'),
|
||||||
|
'org': ip_info.get('org', '') or ip_info.get('isp', ''),
|
||||||
|
'asn': ip_info.get('as', '')
|
||||||
|
})
|
||||||
|
stats['top_ips'] = top_ips_list
|
||||||
|
|
||||||
# Top Countries (max 10)
|
# Top Countries (max 20)
|
||||||
sorted_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)[:10]
|
sorted_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)[:20]
|
||||||
stats['top_countries'] = dict(sorted_countries)
|
stats['top_countries'] = dict(sorted_countries)
|
||||||
|
|
||||||
|
# Top Requests (max 20) - mit Ähnlichkeits-Gruppierung
|
||||||
|
if uris:
|
||||||
|
grouped_uris = group_similar_urls(uris, similarity_threshold=0.85)
|
||||||
|
sorted_uris = sorted(grouped_uris.items(), key=lambda x: x[1], reverse=True)[:20]
|
||||||
|
stats['top_requests'] = dict(sorted_uris)
|
||||||
|
|
||||||
# Req/min berechnen
|
# Req/min berechnen
|
||||||
activation_time = get_shop_activation_time(shop)
|
activation_time = get_shop_activation_time(shop)
|
||||||
if activation_time and stats['log_entries'] > 0:
|
if activation_time and stats['log_entries'] > 0:
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user