"""Metadata layer merging and album-artist normalization for task items."""

from __future__ import annotations

import re
import unicodedata
from collections import Counter
from pathlib import Path
from typing import Any

# Fields that must be non-blank before metadata is considered ingestable.
INGEST_REQUIRED_FIELDS = ('title', 'artist', 'album_artist')
VARIOUS_ARTISTS = 'Various Artists'
# Minimum share of tracks a primary artist needs to be chosen as album artist.
MAIN_ARTIST_THRESHOLD = 0.7
# Separators between artist names. A bare "x" only counts as a separator when
# it has whitespace on both sides ("A x B"); otherwise names that merely
# start or end with the word X ("X Japan", "Lil Nas X") would be truncated.
ARTIST_SPLIT_PATTERN = re.compile(
    r'\s*(?:/|;|,|&|、|(?<=\s)x(?=\s)|\bfeat\.?(?=\s|$)|\bft\.?(?=\s|$)|\bfeaturing\b)\s*',
    re.IGNORECASE
)
# Album-title keywords that hint at a compilation or soundtrack release.
COMPILATION_KEYWORDS = (
    'top',
    'hits',
    'best',
    '精选',
    'ost',
    'original soundtrack',
    'soundtrack',
    '原声带'
)


def merge_metadata_layers(
    raw_metadata: dict[str, Any] | None,
    matched_metadata: dict[str, Any] | None,
    metadata_patch: dict[str, Any] | None = None
) -> dict[str, Any]:
    """Merge raw, matched and patch metadata into a new dict.

    Later layers override earlier ones. Matched values that are ``None`` or
    ``''`` are skipped, so they never erase a raw value. Patch values are
    skipped only when ``None``; an empty-string patch value therefore
    overwrites the key with ``''``.

    Args:
        raw_metadata: Base layer; ``None`` is treated as empty.
        matched_metadata: Second layer; ``None`` is treated as empty.
        metadata_patch: Highest-priority layer; ``None`` is treated as empty.

    Returns:
        A new dict; none of the inputs are mutated.
    """
    merged = dict(raw_metadata or {})
    merged.update(
        {key: value for key, value in (matched_metadata or {}).items() if value not in (None, '')}
    )
    merged.update(
        {key: value for key, value in (metadata_patch or {}).items() if value is not None}
    )
    return merged


def normalize_metadata_shape(metadata: dict[str, Any] | None) -> dict[str, Any]:
    """Return a copy of *metadata* with ``None`` placeholders replaced.

    Text fields that are present but ``None`` become ``''``; a present
    ``compilation`` value of ``''`` or ``None`` becomes ``0``. Keys that are
    absent are not added.
    """
    normalized = dict(metadata or {})
    text_fields = (
        'title',
        'artist',
        'album',
        'album_artist',
        'lyrics',
        'normalization_strategy',
        'album_artist_reason'
    )
    for key in text_fields:
        if key in normalized and normalized[key] is None:
            normalized[key] = ''
    if 'compilation' in normalized and normalized['compilation'] in ('', None):
        normalized['compilation'] = 0
    return normalized


def can_ingest_metadata(metadata: dict[str, Any]) -> bool:
    """Return True when every field in INGEST_REQUIRED_FIELDS is non-blank."""
    return all(str(metadata.get(field) or '').strip() for field in INGEST_REQUIRED_FIELDS)


class MetadataNormalizationService:
    """Derive album-artist and compilation fields for task items.

    Items are grouped with the other items of the same task that share a
    release id, release-group id, album name or parent directory, and the
    album artist is inferred from the primary artists found in that group.
    """

    def __init__(self, task_store):
        # task_store must provide list_all_task_items(task_id, active_only=True).
        self.task_store = task_store

    def create_cache(self) -> dict[str, dict[Any, Any]]:
        """Return an empty cache that can be passed to ``normalize_item``."""
        return {
            'task_items': {},
            'group_entries': {},
            'group_analysis': {}
        }

    def normalize_item(
        self,
        item: dict,
        metadata_patch: dict[str, Any] | None = None,
        cache: dict[str, dict[Any, Any]] | None = None
    ) -> dict[str, Any]:
        """Return the normalized metadata for *item*.

        Args:
            item: Task item; ``id`` and ``task_id`` are read directly, the
                metadata layers and path fields via ``.get``.
            metadata_patch: Optional overrides applied on top of the item's
                stored metadata layers.
            cache: Optional cache from ``create_cache`` that is reused across
                calls to avoid re-listing and re-analysing the same group.

        Returns:
            The merged metadata plus ``album_artist``, ``compilation``,
            ``normalization_strategy``, ``album_artist_reason``,
            ``artist_tokens`` and ``display_artist``.
        """
        merged = merge_metadata_layers(
            item.get('original_tags_json'),
            item.get('matched_metadata_json'),
            metadata_patch
        )
        group_cache_key, group_entries = self._build_group_entries(item, metadata_patch, cache)
        return self._normalize_merged_metadata(merged, group_entries, cache, group_cache_key)

    def _build_group_entries(
        self,
        item: dict,
        metadata_patch: dict[str, Any] | None,
        cache: dict[str, dict[Any, Any]] | None
    ) -> tuple[Any, list[dict[str, Any]]]:
        """Collect the task items that share *item*'s group key.

        Returns:
            ``(cache_key, entries)`` where each entry holds ``item_id``, the
            merged ``metadata`` and the parsed ``artist_info``. When no task
            item matches, the list contains only the current item.
        """
        current_merged = merge_metadata_layers(
            item.get('original_tags_json'),
            item.get('matched_metadata_json'),
            metadata_patch
        )
        task_items = self._get_task_items(item['task_id'], cache)
        current_group_key = self._group_key(current_merged, item)
        cache_key = self._group_cache_key(item, current_group_key, metadata_patch)
        if cache is not None and cache_key in cache['group_entries']:
            return cache_key, cache['group_entries'][cache_key]

        entries: list[dict[str, Any]] = []
        for candidate in task_items:
            # The patch only applies to the item being normalized.
            candidate_patch = metadata_patch if candidate['id'] == item['id'] else None
            merged = merge_metadata_layers(
                candidate.get('original_tags_json'),
                candidate.get('matched_metadata_json'),
                candidate_patch
            )
            if self._group_key(merged, candidate) != current_group_key:
                continue
            entries.append(
                {
                    'item_id': candidate['id'],
                    'metadata': merged,
                    'artist_info': parse_artist_string(merged.get('artist'))
                }
            )

        if not entries:
            # Nothing from the task listing matched: analyse the item alone.
            entries = [
                {
                    'item_id': item['id'],
                    'metadata': current_merged,
                    'artist_info': parse_artist_string(current_merged.get('artist'))
                }
            ]

        if cache is not None:
            cache['group_entries'][cache_key] = entries
        return cache_key, entries

    def _get_task_items(
        self,
        task_id: str,
        cache: dict[str, dict[Any, Any]] | None
    ) -> list[dict[str, Any]]:
        """List the task's active items, memoised in *cache* when given."""
        if cache is None:
            return self.task_store.list_all_task_items(task_id, active_only=True)
        task_items = cache['task_items'].get(task_id)
        if task_items is None:
            task_items = self.task_store.list_all_task_items(task_id, active_only=True)
            cache['task_items'][task_id] = task_items
        return task_items

    def _group_cache_key(
        self,
        item: dict,
        current_group_key: tuple[str, str],
        metadata_patch: dict[str, Any] | None
    ) -> tuple[Any, ...]:
        """Build the cache key for a group lookup.

        Without a patch the key is shared by every item of the group. With a
        patch the item id and patch contents are added, so a patched view is
        never served to other items.
        """
        patch_key = self._metadata_patch_cache_key(metadata_patch)
        if patch_key is None:
            return (item['task_id'], current_group_key)
        return (item['task_id'], current_group_key, item['id'], patch_key)

    def _metadata_patch_cache_key(
        self,
        metadata_patch: dict[str, Any] | None
    ) -> tuple[tuple[str, str], ...] | None:
        """Return a hashable, order-independent key for a patch, or None if empty."""
        if not metadata_patch:
            return None
        # repr() keeps unhashable patch values usable inside a dict key.
        return tuple(sorted((key, repr(value)) for key, value in metadata_patch.items()))

    def _group_key(self, metadata: dict[str, Any], item: dict) -> tuple[str, str]:
        """Return the ``(kind, value)`` key used to group items of one album.

        Precedence: ``release_id``, ``release_group_id``, ``album``, the
        parent directory of the item's path, and finally the item id itself.
        """
        for key in ('release_id', 'release_group_id'):
            value = _clean_token(metadata.get(key))
            if value:
                return (key, value)

        album = _clean_token(metadata.get('album'))
        if album:
            return ('album', album)

        parent_dir = Path(item.get('relative_path') or item.get('filename') or '').parent.as_posix()
        # '.' means the path has no directory component.
        if parent_dir and parent_dir != '.':
            return ('path', _clean_token(parent_dir))
        return ('item', str(item['id']))

    def _normalize_merged_metadata(
        self,
        merged: dict[str, Any],
        group_entries: list[dict[str, Any]],
        cache: dict[str, dict[Any, Any]] | None,
        group_cache_key: Any
    ) -> dict[str, Any]:
        """Fill in album-artist related fields on a copy of *merged*.

        An existing non-blank ``album_artist`` is preserved and the incoming
        ``compilation`` value is coerced to 0/1. Otherwise the album artist
        is derived from the group analysis, and ``compilation`` is replaced
        by the derived value (1 only for the ``compilation`` strategy).
        """
        normalized = dict(merged)
        artist_value = str(normalized.get('artist') or '').strip()
        album_artist_value = str(normalized.get('album_artist') or '').strip()
        album_value = str(normalized.get('album') or '').strip()
        artist_info = parse_artist_string(artist_value)

        if album_artist_value:
            normalized['album_artist'] = album_artist_value
            normalized['normalization_strategy'] = 'source_preserved'
            normalized['album_artist_reason'] = '保留来源或人工指定的专辑艺术家'
            normalized['artist_tokens'] = artist_info['tokens']
            normalized['display_artist'] = artist_info['display_artist']
            normalized['compilation'] = 1 if _truthy_compilation(normalized.get('compilation')) else 0
            return normalize_metadata_shape(normalized)

        group_analysis = self._analyze_group_entries(group_entries, cache, group_cache_key)
        dominant_artist = group_analysis['dominant_artist']
        dominant_ratio = group_analysis['dominant_ratio']
        unique_main_artists = group_analysis['unique_main_artists']
        has_collaboration_markup = group_analysis['has_collaboration_markup']
        compilation_keyword_hit = _has_compilation_keyword(album_value)

        # Defaults apply when no rule below matches.
        strategy = 'unresolved'
        album_artist = ''
        reason = '无法从当前专辑分组推导专辑艺术家'
        compilation = 0

        # Rules are ordered from most to least specific; the first hit wins.
        if len(unique_main_artists) == 1 and dominant_artist and not has_collaboration_markup:
            strategy = 'single_artist'
            album_artist = dominant_artist
            reason = '同专辑曲目的主艺人一致,按单艺人专辑处理'
        elif len(unique_main_artists) == 1 and dominant_artist and has_collaboration_markup:
            strategy = 'main_artist_feat'
            album_artist = dominant_artist
            reason = '同专辑主艺人一致,但存在 feat/合作曲目,按主艺人专辑处理'
        elif dominant_artist and dominant_ratio >= MAIN_ARTIST_THRESHOLD:
            strategy = 'main_artist_feat'
            album_artist = dominant_artist
            reason = f'主艺人 {dominant_artist} 在同专辑中占比达到 {dominant_ratio:.0%}'
        elif len(unique_main_artists) > 1 and compilation_keyword_hit:
            strategy = 'compilation'
            album_artist = VARIOUS_ARTISTS
            reason = '多艺人分散且专辑名命中合辑/原声带关键词'
            compilation = 1
        elif dominant_artist:
            strategy = 'dominant_artist_fallback'
            album_artist = dominant_artist
            reason = f'未命中合辑规则,回退到出现频次最高的主艺人 {dominant_artist}'
        elif artist_value:
            strategy = 'single_track_fallback'
            album_artist = artist_info['primary'] or artist_value
            reason = '仅有当前曲目可用,回退到当前曲目艺人'

        normalized['album_artist'] = album_artist
        normalized['compilation'] = compilation
        normalized['normalization_strategy'] = strategy
        normalized['album_artist_reason'] = reason
        normalized['artist_tokens'] = artist_info['tokens']
        normalized['display_artist'] = artist_info['display_artist']
        return normalize_metadata_shape(normalized)

    def _analyze_group_entries(
        self,
        group_entries: list[dict[str, Any]],
        cache: dict[str, dict[Any, Any]] | None,
        group_cache_key: Any
    ) -> dict[str, Any]:
        """Summarise the primary artists of a group, memoised in *cache*.

        Returns:
            A dict with ``dominant_artist`` (most frequent primary artist or
            None), ``dominant_ratio`` (its share among entries that have a
            primary artist), ``unique_main_artists`` (set of primary artists)
            and ``has_collaboration_markup`` (True when any entry's artist
            string split into more than one token).
        """
        if cache is not None and group_cache_key in cache['group_analysis']:
            return cache['group_analysis'][group_cache_key]

        main_artists = [
            entry['artist_info']['primary']
            for entry in group_entries
            if entry['artist_info']['primary']
        ]
        has_collaboration_markup = any(
            len(entry['artist_info']['tokens']) > 1 for entry in group_entries
        )
        main_artist_counts = Counter(main_artists)
        unique_main_artists = set(main_artists)
        if main_artist_counts:
            dominant_artist, dominant_count = main_artist_counts.most_common(1)[0]
        else:
            dominant_artist, dominant_count = None, 0

        analysis = {
            'dominant_artist': dominant_artist,
            'dominant_ratio': (dominant_count / len(main_artists)) if main_artists else 0.0,
            'unique_main_artists': unique_main_artists,
            'has_collaboration_markup': has_collaboration_markup
        }
        if cache is not None:
            cache['group_analysis'][group_cache_key] = analysis
        return analysis


def parse_artist_string(value: Any) -> dict[str, Any]:
    """Split an artist string into its individual artist names.

    Args:
        value: Raw artist value; ``None`` and other falsy values count as empty.

    Returns:
        A dict with ``display_artist`` (the stripped input), ``tokens`` (the
        NFKC-normalized names in order of appearance) and ``primary`` (the
        first token, or ``''`` when there is none).
    """
    display_artist = str(value or '').strip()
    if not display_artist:
        return {'display_artist': '', 'tokens': [], 'primary': ''}

    normalized = unicodedata.normalize('NFKC', display_artist)
    tokens = [
        cleaned
        for cleaned in map(_normalize_artist_token, ARTIST_SPLIT_PATTERN.split(normalized))
        if cleaned
    ]
    if not tokens:
        # The string consisted only of separators: keep it as a single name.
        fallback = _normalize_artist_token(normalized)
        tokens = [fallback] if fallback else []

    return {
        'display_artist': display_artist,
        'tokens': tokens,
        'primary': tokens[0] if tokens else ''
    }


def _normalize_artist_token(value: str) -> str:
    """NFKC-normalize, strip and collapse whitespace, preserving case."""
    cleaned = unicodedata.normalize('NFKC', str(value or '')).strip()
    return re.sub(r'\s+', ' ', cleaned)


def _clean_token(value: Any) -> str:
    """NFKC-normalize, strip, lowercase and collapse whitespace for comparisons."""
    cleaned = unicodedata.normalize('NFKC', str(value or '')).strip().lower()
    return re.sub(r'\s+', ' ', cleaned)


def _keyword_matches(keyword: str, text: str) -> bool:
    """Return True when *keyword* occurs in the already-cleaned *text*.

    ASCII keywords must not touch another ASCII letter or digit on either
    side, so 'ost' no longer fires inside 'lost' nor 'top' inside 'stop'.
    Non-ASCII keywords are matched as plain substrings because CJK text has
    no word separators.
    """
    if keyword.isascii():
        pattern = rf'(?<![a-z0-9]){re.escape(keyword)}(?![a-z0-9])'
        return re.search(pattern, text) is not None
    return keyword in text


def _has_compilation_keyword(album: str) -> bool:
    """Return True when the album title contains any of COMPILATION_KEYWORDS."""
    normalized = _clean_token(album)
    if not normalized:
        return False
    return any(_keyword_matches(keyword, normalized) for keyword in COMPILATION_KEYWORDS)


def _truthy_compilation(value: Any) -> bool:
    """Interpret a compilation flag given as a bool, a number or a string.

    Strings are true only for '1', 'true' and 'yes' (case-insensitive, after
    stripping); numbers are true when non-zero.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        return value != 0
    return str(value or '').strip().lower() in {'1', 'true', 'yes'}