perf optimization
@@ -3173,15 +3173,14 @@ def levenshtein_similarity(s1: str, s2: str) -> float:
     return 1.0 - (distance / max_len)
 
 
-def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.85) -> Dict[str, int]:
+def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.80) -> Dict[str, int]:
     """
-    Groups similar URLs using a hybrid approach:
-    1. Pre-filter by prefix (first path segment)
-    2. Within each group: Levenshtein clustering for small groups
+    Groups similar URLs using a fast prefix approach.
+    Optimized for performance - no more Levenshtein because of its CPU load.
 
     Args:
         urls: Dict of {url: count}
-        similarity_threshold: minimum similarity for grouping (0.0-1.0)
+        similarity_threshold: no longer used (kept for compatibility)
 
     Returns:
         Dict of {representative_url: summed_count}
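A note on the performance claim in the rewritten docstring: dynamic-programming Levenshtein distance costs O(len(s1) * len(s2)) per comparison, and the old hybrid path compared each URL against every existing cluster representative, so a prefix group of n URLs could trigger on the order of n^2 such comparisons. The next hunk replaces all of that with one dict lookup per URL.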
@@ -3189,63 +3188,41 @@ def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.85)
     if not urls:
         return {}
 
-    # Step 1: group by the first path segment
+    # Only process the top 500 URLs (performance)
+    sorted_urls = sorted(urls.items(), key=lambda x: x[1], reverse=True)[:500]
+
+    # Group by normalized path (strip query params, truncate to 50 characters)
     prefix_groups = {}
-    for url, count in urls.items():
-        # Extract the prefix (first segment, or everything up to the first ?)
+    for url, count in sorted_urls:
+        # Normalize the URL: strip query parameters
         path = url.split('?')[0] if '?' in url else url
-        segments = path.strip('/').split('/')
-        prefix = segments[0][:20] if segments and segments[0] else '/'
+        # Truncate to the first 50 characters for grouping
+        normalized = path[:50]
 
-        if prefix not in prefix_groups:
-            prefix_groups[prefix] = []
-        prefix_groups[prefix].append((url, count))
+        if normalized not in prefix_groups:
+            prefix_groups[normalized] = {'urls': [], 'total': 0}
+        prefix_groups[normalized]['urls'].append(url)
+        prefix_groups[normalized]['total'] += count
 
-    # Step 2: merge similar URLs within each prefix group
+    # Build the result
     result = {}
+    for normalized, data in prefix_groups.items():
+        url_count = len(data['urls'])
+        total = data['total']
 
-    for prefix, url_list in prefix_groups.items():
-        # Sort by count (most frequent first)
-        url_list.sort(key=lambda x: x[1], reverse=True)
-
-        # For small groups: Levenshtein clustering
-        if len(url_list) <= 100:
-            clusters = []  # list of (representative, total_count, [urls])
-
-            for url, count in url_list:
-                # Try to add the URL to an existing cluster
-                added = False
-                for cluster in clusters:
-                    rep_url = cluster[0]
-                    similarity = levenshtein_similarity(url, rep_url)
-                    if similarity >= similarity_threshold:
-                        cluster[1] += count
-                        cluster[2].append(url)
-                        added = True
-                        break
-
-                if not added:
-                    # New cluster
-                    clusters.append([url, count, [url]])
-
-            # Cluster representatives become the result
-            for rep_url, total_count, members in clusters:
-                # If several URLs were merged, mark the entry
-                if len(members) > 1:
-                    display_url = rep_url.split('?')[0]  # strip query parameters
-                    if len(display_url) > 50:
-                        display_url = display_url[:47] + '...'
-                    display_url += f' ({len(members)}x)'
-                else:
-                    display_url = rep_url
-                    if len(display_url) > 60:
-                        display_url = display_url[:57] + '...'
-                result[display_url] = total_count
+        if url_count > 1:
+            # Several similar URLs
+            display_url = normalized
+            if len(display_url) > 47:
+                display_url = display_url[:44] + '...'
+            display_url += f' ({url_count}x)'
         else:
-            # For large groups: simple prefix grouping
-            total = sum(c for _, c in url_list)
-            display_prefix = f'/{prefix}/* ({len(url_list)} URLs)'
-            result[display_prefix] = total
+            # Single URL
+            display_url = data['urls'][0]
+            if len(display_url) > 60:
+                display_url = display_url[:57] + '...'
+
+        result[display_url] = total
 
     return result
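To illustrate the new grouping key, here is a minimal, self-contained sketch. The helper name and the sample URLs are invented for illustration and are not part of the repository:

from typing import Dict

def normalize_url(url: str) -> str:
    # Mirrors the commit's grouping key: strip the query string, keep 50 chars
    path = url.split('?')[0] if '?' in url else url
    return path[:50]

# Hypothetical access-log URLs with hit counts
urls: Dict[str, int] = {
    '/shop/artikel?id=1': 10,
    '/shop/artikel?id=2': 7,
    '/shop/artikel?id=3': 3,
    '/impressum': 5,
}

groups: Dict[str, int] = {}
for url, count in urls.items():
    key = normalize_url(url)
    groups[key] = groups.get(key, 0) + count

print(groups)  # {'/shop/artikel': 20, '/impressum': 5}

Note the trade-off: URLs that differ only in their query string or only after the 50th character now collapse into one entry, whereas the old Levenshtein clustering could keep them apart. The commit accepts that loss of precision in exchange for linear-time grouping over at most 500 URLs.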
@@ -3421,17 +3398,18 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
     sorted_bots = sorted(bots.items(), key=lambda x: x[1], reverse=True)[:20]
     stats['top_bots'] = dict(sorted_bots)
 
-    # Top IPs (max 20) - with country info
+    # Top IPs (max 20) - with country info (cache only, no API calls!)
     sorted_ips = sorted(ips.items(), key=lambda x: x[1], reverse=True)[:20]
     top_ips_list = []
     for ip, count in sorted_ips:
-        ip_info = get_ip_info(ip)
+        # Only a fast country lookup from the local cache, no external API calls
+        country_code = get_country_for_ip_cached(ip)
         top_ips_list.append({
             'ip': ip,
             'count': count,
-            'country': ip_info.get('countryCode', 'XX'),
-            'org': ip_info.get('org', '') or ip_info.get('isp', ''),
-            'asn': ip_info.get('as', '')
+            'country': country_code if country_code else 'XX',
+            'org': '',
+            'asn': ''
         })
     stats['top_ips'] = top_ips_list
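get_country_for_ip_cached is called here but not defined in this diff. As a minimal sketch, assuming the project keeps an in-memory dict of previously resolved IPs (the cache name and structure below are assumptions, not the repository's actual code), a cache-only lookup matching this call site could look like:

from typing import Dict, Optional

# Assumed in-memory cache of previously resolved IPs; the real project
# may populate and persist this differently (e.g. from earlier get_ip_info results).
_COUNTRY_CACHE: Dict[str, str] = {}

def get_country_for_ip_cached(ip: str) -> Optional[str]:
    # Cached country code, or None on a miss.
    # Deliberately no external API call, so it never blocks on the network.
    return _COUNTRY_CACHE.get(ip)

The call site already handles the miss case with the 'XX' fallback, so returning None on a cache miss is consistent with the new code.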