perf optimization
@@ -3173,15 +3173,14 @@ def levenshtein_similarity(s1: str, s2: str) -> float:
     return 1.0 - (distance / max_len)


-def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.85) -> Dict[str, int]:
+def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.80) -> Dict[str, int]:
     """
-    Groups similar URLs using a hybrid approach:
-    1. Pre-filter by prefix (first path segment)
-    2. Within each group: Levenshtein clustering for small groups
+    Groups similar URLs using a fast prefix-based approach.
+    Optimized for performance - no more Levenshtein because of CPU load.

     Args:
         urls: Dict of {url: count}
-        similarity_threshold: Minimum similarity for grouping (0.0-1.0)
+        similarity_threshold: No longer used (kept for compatibility)

     Returns:
         Dict of {representative_url: summed_count}
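The hunk above only shows the tail of `levenshtein_similarity`, the helper the old clustering path called. For context, a minimal sketch of such a helper with the signature and final return expression seen in the diff; the body before the return is not part of the commit and is an assumed standard implementation:

def levenshtein_similarity(s1: str, s2: str) -> float:
    # Assumed body: standard Wagner-Fischer edit distance with rolling rows.
    # Only the signature and the final return line appear in the diff above.
    if not s1 and not s2:
        return 1.0
    m, n = len(s1), len(s2)
    prev = list(range(n + 1))
    for i in range(1, m + 1):
        curr = [i] + [0] * n
        for j in range(1, n + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            curr[j] = min(prev[j] + 1,         # deletion
                          curr[j - 1] + 1,     # insertion
                          prev[j - 1] + cost)  # substitution
        prev = curr
    distance = prev[n]
    max_len = max(m, n)
    return 1.0 - (distance / max_len)

Each call is O(len(s1) * len(s2)), which is why running it pairwise inside the grouping loop was the CPU hotspot this commit removes.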
@@ -3189,63 +3188,41 @@ def group_similar_urls(urls: Dict[str, int], similarity_threshold: float = 0.85)
     if not urls:
         return {}

-    # Step 1: group by first path segment
+    # Only process the top 500 URLs (performance)
+    sorted_urls = sorted(urls.items(), key=lambda x: x[1], reverse=True)[:500]
+
+    # Group by normalized path (strip query params, truncate to 50 chars)
     prefix_groups = {}
-    for url, count in urls.items():
-        # Extract prefix (first segment, or up to the first ?)
+    for url, count in sorted_urls:
+        # Normalize URL: strip query parameters
         path = url.split('?')[0] if '?' in url else url
-        segments = path.strip('/').split('/')
-        prefix = segments[0][:20] if segments and segments[0] else '/'
+        # Truncate to the first 50 characters for grouping
+        normalized = path[:50]

-        if prefix not in prefix_groups:
-            prefix_groups[prefix] = []
-        prefix_groups[prefix].append((url, count))
+        if normalized not in prefix_groups:
+            prefix_groups[normalized] = {'urls': [], 'total': 0}
+        prefix_groups[normalized]['urls'].append(url)
+        prefix_groups[normalized]['total'] += count

-    # Step 2: merge similar URLs within each prefix group
+    # Build the result
     result = {}
-
-    for prefix, url_list in prefix_groups.items():
-        # Sort by count (most frequent first)
-        url_list.sort(key=lambda x: x[1], reverse=True)
-
-        # Small groups: Levenshtein clustering
-        if len(url_list) <= 100:
-            clusters = []  # list of (representative, total_count, [urls])
-
-            for url, count in url_list:
-                # Try to add to an existing cluster
-                added = False
-                for cluster in clusters:
-                    rep_url = cluster[0]
-                    similarity = levenshtein_similarity(url, rep_url)
-                    if similarity >= similarity_threshold:
-                        cluster[1] += count
-                        cluster[2].append(url)
-                        added = True
-                        break
-
-                if not added:
-                    # New cluster
-                    clusters.append([url, count, [url]])
-
-            # Cluster representatives become the result
-            for rep_url, total_count, members in clusters:
-                # If several URLs were merged, mark them with a count
-                if len(members) > 1:
-                    display_url = rep_url.split('?')[0]  # strip query parameters
-                    if len(display_url) > 50:
-                        display_url = display_url[:47] + '...'
-                    display_url += f' ({len(members)}x)'
-                else:
-                    display_url = rep_url
-                    if len(display_url) > 60:
-                        display_url = display_url[:57] + '...'
-                result[display_url] = total_count
-        else:
-            # Large groups: simple prefix grouping
-            total = sum(c for _, c in url_list)
-            display_prefix = f'/{prefix}/* ({len(url_list)} URLs)'
-            result[display_prefix] = total
+    for normalized, data in prefix_groups.items():
+        url_count = len(data['urls'])
+        total = data['total']
+
+        if url_count > 1:
+            # Several similar URLs
+            display_url = normalized
+            if len(display_url) > 47:
+                display_url = display_url[:44] + '...'
+            display_url += f' ({url_count}x)'
+        else:
+            # Single URL
+            display_url = data['urls'][0]
+            if len(display_url) > 60:
+                display_url = display_url[:57] + '...'
+
+        result[display_url] = total

     return result
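For illustration, here is the new grouping logic condensed into a self-contained sketch with hypothetical access-log URLs; `group_similar_urls_sketch` is a stand-in name, not part of the commit:

from typing import Dict

def group_similar_urls_sketch(urls: Dict[str, int]) -> Dict[str, int]:
    # Only the 500 most frequent URLs are considered, for performance.
    top = sorted(urls.items(), key=lambda x: x[1], reverse=True)[:500]
    groups: Dict[str, dict] = {}
    for url, count in top:
        # Normalize: strip the query string, truncate to 50 characters.
        normalized = url.split('?')[0][:50]
        g = groups.setdefault(normalized, {'urls': [], 'total': 0})
        g['urls'].append(url)
        g['total'] += count
    result = {}
    for normalized, g in groups.items():
        if len(g['urls']) > 1:
            label = (normalized[:44] + '...') if len(normalized) > 47 else normalized
            label += f" ({len(g['urls'])}x)"
        else:
            label = g['urls'][0]
            if len(label) > 60:
                label = label[:57] + '...'
        result[label] = g['total']
    return result

print(group_similar_urls_sketch({
    '/products/shoe-1?ref=google': 120,
    '/products/shoe-1?ref=bing': 80,
    '/checkout': 15,
}))
# {'/products/shoe-1 (2x)': 200, '/checkout': 15}

The trade-off versus the old Levenshtein path: query-parameter variants of the same path still collapse into one entry, but URLs that differ within their first 50 path characters (e.g. typo variants) are no longer merged. In exchange, the function is O(n log n) for the sort instead of quadratic in the group size.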
@@ -3421,17 +3398,18 @@ def get_shop_log_stats(shop: str) -> Dict[str, Any]:
     sorted_bots = sorted(bots.items(), key=lambda x: x[1], reverse=True)[:20]
     stats['top_bots'] = dict(sorted_bots)

-    # Top IPs (max 20) - with country info
+    # Top IPs (max 20) - with country info (cache only, no API calls!)
     sorted_ips = sorted(ips.items(), key=lambda x: x[1], reverse=True)[:20]
     top_ips_list = []
     for ip, count in sorted_ips:
-        ip_info = get_ip_info(ip)
+        # Only a fast country lookup from the local cache, no external API calls
+        country_code = get_country_for_ip_cached(ip)
         top_ips_list.append({
             'ip': ip,
             'count': count,
-            'country': ip_info.get('countryCode', 'XX'),
-            'org': ip_info.get('org', '') or ip_info.get('isp', ''),
-            'asn': ip_info.get('as', '')
+            'country': country_code if country_code else 'XX',
+            'org': '',
+            'asn': ''
         })
     stats['top_ips'] = top_ips_list
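`get_country_for_ip_cached` itself is not shown in this commit; a minimal sketch of the contract the new code appears to rely on (the cache name and layout are assumptions):

from typing import Dict, Optional

# Hypothetical in-memory cache, e.g. populated by earlier get_ip_info() calls.
_ip_country_cache: Dict[str, str] = {}

def get_country_for_ip_cached(ip: str) -> Optional[str]:
    # Pure dict lookup: returns a cached ISO country code or None.
    # Crucially, it never triggers a network request, so building the
    # stats no longer blocks on an external geo-IP API per top IP.
    return _ip_country_cache.get(ip)

With this change, `org` and `asn` are left empty rather than fetched, trading detail in the top-IPs table for a stats endpoint that never stalls on 20 sequential API lookups.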