# Source listing header: 310 lines, 11 KiB, Python
from __future__ import annotations
|
|
|
|
import re
|
|
import unicodedata
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
# Fields that must be non-blank before a metadata record may be ingested.
INGEST_REQUIRED_FIELDS = ('title', 'artist', 'album_artist')

# Album artist assigned to groups classified as compilations.
VARIOUS_ARTISTS = 'Various Artists'

# Minimum share of tracks (0..1) one primary artist must hold to be treated
# as the album's main artist.
MAIN_ARTIST_THRESHOLD = 0.7

# Separators between artist names inside a single artist string.
#
# The collaboration marker "x" (as in "A x B") only counts as a separator when
# it stands between two names: whitespace on both sides, and the next token is
# neither another separator nor a feat marker.  A plain word-boundary match
# would truncate names that legitimately start or end with a standalone "X"
# ("X Japan", "Lil Nas X", "Lil Nas X feat. ...").
# The pattern deliberately contains no capturing groups so that ``split()``
# returns only the name fragments.
ARTIST_SPLIT_PATTERN = re.compile(
    r'\s*(?:/|;|,|&|、'
    r'|(?<=\s)x(?=\s+(?!(?:feat\.?|ft\.?|featuring)(?=\s|$))[^\s/;,&、])'
    r'|\bfeat\.?(?=\s|$)|\bft\.?(?=\s|$)|\bfeaturing\b)\s*',
    re.IGNORECASE,
)

# Lower-case album-title keywords that hint at a compilation or soundtrack.
COMPILATION_KEYWORDS = (
    'top',
    'hits',
    'best',
    '精选',
    'ost',
    'original soundtrack',
    'soundtrack',
    '原声带',
)
|
|
|
|
|
|
def merge_metadata_layers(
    raw_metadata: dict[str, Any] | None,
    matched_metadata: dict[str, Any] | None,
    metadata_patch: dict[str, Any] | None = None
) -> dict[str, Any]:
    """Combine three metadata layers into one new dict.

    Layers are applied in increasing priority: raw, then matched, then patch.
    A matched value only overrides when it is neither ``None`` nor ``''``;
    a patch value overrides unless it is ``None`` (so an empty string in the
    patch clears the field).  None of the inputs is modified.
    """
    layered: dict[str, Any] = dict(raw_metadata) if raw_metadata else {}

    for field, candidate in (matched_metadata or {}).items():
        if candidate in (None, ''):
            continue
        layered[field] = candidate

    for field, override in (metadata_patch or {}).items():
        if override is None:
            continue
        layered[field] = override

    return layered
|
|
|
|
|
|
def normalize_metadata_shape(metadata: dict[str, Any] | None) -> dict[str, Any]:
    """Return a copy of *metadata* with ``None`` placeholders replaced.

    Known text fields that are present with a ``None`` value become ``''``,
    and a ``compilation`` entry that is ``''`` or ``None`` becomes ``0``.
    Keys that are absent are not added; other keys are left untouched.
    """
    text_fields = (
        'title',
        'artist',
        'album',
        'album_artist',
        'lyrics',
        'normalization_strategy',
        'album_artist_reason',
    )
    shaped: dict[str, Any] = dict(metadata) if metadata else {}

    for field in text_fields:
        # The '' default keeps absent keys from looking like None.
        if shaped.get(field, '') is None:
            shaped[field] = ''

    # The 0 default keeps an absent key from being inserted.
    if shaped.get('compilation', 0) in ('', None):
        shaped['compilation'] = 0

    return shaped
|
|
|
|
|
|
def can_ingest_metadata(metadata: dict[str, Any]) -> bool:
    """Tell whether every field in ``INGEST_REQUIRED_FIELDS`` is non-blank.

    A field counts as blank when it is missing, falsy, or only whitespace
    once converted to a string.
    """
    for field in INGEST_REQUIRED_FIELDS:
        text = str(metadata.get(field) or '')
        if not text.strip():
            return False
    return True
|
|
|
|
|
|
class MetadataNormalizationService:
    """Fill in album-level metadata for a task item.

    For one item the service merges its metadata layers, collects the other
    items of the same task that fall into the same group (release id, album
    name, parent folder, or the item itself) and picks an ``album_artist``
    plus a strategy/reason describing how it was chosen.
    """

    def __init__(self, task_store):
        # Only ``list_all_task_items(task_id, active_only=True)`` is called on it.
        self.task_store = task_store

    def create_cache(self) -> dict[str, dict[Any, Any]]:
        """Return an empty cache with one dict per cached section."""
        return {section: {} for section in ('task_items', 'group_entries', 'group_analysis')}

    def normalize_item(
        self,
        item: dict,
        metadata_patch: dict[str, Any] | None = None,
        cache: dict[str, dict[Any, Any]] | None = None
    ) -> dict[str, Any]:
        """Merge the item's metadata layers and normalize the result.

        ``cache`` is optional; when given (see ``create_cache``) task items,
        group entries and group analysis are reused across calls.
        """
        item_metadata = merge_metadata_layers(
            item.get('original_tags_json'),
            item.get('matched_metadata_json'),
            metadata_patch
        )
        entries_key, entries = self._build_group_entries(item, metadata_patch, cache)
        return self._normalize_merged_metadata(item_metadata, entries, cache, entries_key)

    def _build_group_entries(
        self,
        item: dict,
        metadata_patch: dict[str, Any] | None,
        cache: dict[str, dict[Any, Any]] | None
    ) -> tuple[Any, list[dict[str, Any]]]:
        """Return ``(cache_key, entries)`` for the group *item* belongs to.

        Each entry holds the item id, its merged metadata and the parsed
        artist info.  The patch is applied only to the candidate whose id
        equals the current item's id.
        """
        own_metadata = merge_metadata_layers(
            item.get('original_tags_json'),
            item.get('matched_metadata_json'),
            metadata_patch
        )
        siblings = self._get_task_items(item['task_id'], cache)
        own_group = self._group_key(own_metadata, item)
        lookup_key = self._group_cache_key(item, own_group, metadata_patch)

        if cache is not None and lookup_key in cache['group_entries']:
            return lookup_key, cache['group_entries'][lookup_key]

        def make_entry(item_id: Any, metadata: dict[str, Any]) -> dict[str, Any]:
            return {
                'item_id': item_id,
                'metadata': metadata,
                'artist_info': parse_artist_string(metadata.get('artist'))
            }

        members: list[dict[str, Any]] = []
        for sibling in siblings:
            is_current = sibling['id'] == item['id']
            sibling_metadata = merge_metadata_layers(
                sibling.get('original_tags_json'),
                sibling.get('matched_metadata_json'),
                metadata_patch if is_current else None
            )
            if self._group_key(sibling_metadata, sibling) == own_group:
                members.append(make_entry(sibling['id'], sibling_metadata))

        # No listed item matched the group: fall back to the current item alone.
        if not members:
            members = [make_entry(item['id'], own_metadata)]

        if cache is not None:
            cache['group_entries'][lookup_key] = members

        return lookup_key, members

    def _get_task_items(
        self,
        task_id: str,
        cache: dict[str, dict[Any, Any]] | None
    ) -> list[dict[str, Any]]:
        """List the task's active items, loading them once per cache."""
        if cache is None:
            return self.task_store.list_all_task_items(task_id, active_only=True)

        per_task = cache['task_items']
        loaded = per_task.get(task_id)
        if loaded is None:
            loaded = self.task_store.list_all_task_items(task_id, active_only=True)
            per_task[task_id] = loaded
        return loaded

    def _group_cache_key(
        self,
        item: dict,
        current_group_key: tuple[str, str],
        metadata_patch: dict[str, Any] | None
    ) -> tuple[Any, ...]:
        """Build the cache key for a group.

        Without a patch the key is shared by the whole group; with a patch it
        also carries the item id and the patch contents.
        """
        patch_key = self._metadata_patch_cache_key(metadata_patch)
        shared_key = (item['task_id'], current_group_key)
        if patch_key is None:
            return shared_key
        return shared_key + (item['id'], patch_key)

    def _metadata_patch_cache_key(self, metadata_patch: dict[str, Any] | None) -> tuple[tuple[str, str], ...] | None:
        """Return a hashable, order-independent key for a patch, or ``None`` if it is empty."""
        if not metadata_patch:
            return None
        pairs = [(key, repr(value)) for key, value in metadata_patch.items()]
        pairs.sort()
        return tuple(pairs)

    def _group_key(self, metadata: dict[str, Any], item: dict) -> tuple[str, str]:
        """Pick the grouping key: release id, release group id, album, folder, then item id."""
        for field in ('release_id', 'release_group_id', 'album'):
            token = _clean_token(metadata.get(field))
            if token:
                return (field, token)

        location = item.get('relative_path') or item.get('filename') or ''
        folder = Path(location).parent.as_posix()
        if folder not in ('', '.'):
            return ('path', _clean_token(folder))

        return ('item', str(item['id']))

    def _normalize_merged_metadata(
        self,
        merged: dict[str, Any],
        group_entries: list[dict[str, Any]],
        cache: dict[str, dict[Any, Any]] | None,
        group_cache_key: Any
    ) -> dict[str, Any]:
        """Return a copy of *merged* with album artist, compilation flag and strategy set.

        A non-blank ``album_artist`` already present is kept as is; otherwise
        it is derived from the primary artists of the group entries.
        """
        result = dict(merged)
        artist_text = str(result.get('artist') or '').strip()
        explicit_album_artist = str(result.get('album_artist') or '').strip()
        album_text = str(result.get('album') or '').strip()
        artist_info = parse_artist_string(artist_text)

        if explicit_album_artist:
            result.update({
                'album_artist': explicit_album_artist,
                'normalization_strategy': 'source_preserved',
                'album_artist_reason': '保留来源或人工指定的专辑艺术家',
                'artist_tokens': artist_info['tokens'],
                'display_artist': artist_info['display_artist'],
                'compilation': 1 if _truthy_compilation(result.get('compilation')) else 0,
            })
            return normalize_metadata_shape(result)

        analysis = self._analyze_group_entries(group_entries, cache, group_cache_key)
        top_artist = analysis['dominant_artist']
        top_ratio = analysis['dominant_ratio']
        distinct_count = len(analysis['unique_main_artists'])
        has_collab = analysis['has_collaboration_markup']
        keyword_hit = _has_compilation_keyword(album_text)

        compilation = 0
        if distinct_count == 1 and top_artist:
            # One primary artist across the group; collaborations only change the label.
            album_artist = top_artist
            if has_collab:
                strategy = 'main_artist_feat'
                reason = '同专辑主艺人一致,但存在 feat/合作曲目,按主艺人专辑处理'
            else:
                strategy = 'single_artist'
                reason = '同专辑曲目的主艺人一致,按单艺人专辑处理'
        elif top_artist and top_ratio >= MAIN_ARTIST_THRESHOLD:
            strategy = 'main_artist_feat'
            album_artist = top_artist
            reason = f'主艺人 {top_artist} 在同专辑中占比达到 {top_ratio:.0%}'
        elif distinct_count > 1 and keyword_hit:
            strategy = 'compilation'
            album_artist = VARIOUS_ARTISTS
            reason = '多艺人分散且专辑名命中合辑/原声带关键词'
            compilation = 1
        elif top_artist:
            strategy = 'dominant_artist_fallback'
            album_artist = top_artist
            reason = f'未命中合辑规则,回退到出现频次最高的主艺人 {top_artist}'
        elif artist_text:
            strategy = 'single_track_fallback'
            album_artist = artist_info['primary'] or artist_text
            reason = '仅有当前曲目可用,回退到当前曲目艺人'
        else:
            strategy = 'unresolved'
            album_artist = ''
            reason = '无法从当前专辑分组推导专辑艺术家'

        result.update({
            'album_artist': album_artist,
            'compilation': compilation,
            'normalization_strategy': strategy,
            'album_artist_reason': reason,
            'artist_tokens': artist_info['tokens'],
            'display_artist': artist_info['display_artist'],
        })
        return normalize_metadata_shape(result)

    def _analyze_group_entries(
        self,
        group_entries: list[dict[str, Any]],
        cache: dict[str, dict[Any, Any]] | None,
        group_cache_key: Any
    ) -> dict[str, Any]:
        """Summarize the primary artists of a group.

        Returns the most frequent primary artist, its share among entries that
        have a primary artist, the set of distinct primary artists, and whether
        any entry lists more than one artist token.  The result is stored in
        (and served from) ``cache['group_analysis']`` when a cache is given.
        """
        if cache is not None and group_cache_key in cache['group_analysis']:
            return cache['group_analysis'][group_cache_key]

        primaries = []
        for entry in group_entries:
            primary = entry['artist_info']['primary']
            if primary:
                primaries.append(primary)

        has_collab = False
        for entry in group_entries:
            if len(entry['artist_info']['tokens']) > 1:
                has_collab = True
                break

        tally = Counter(primaries)
        if tally:
            top_artist, top_count = tally.most_common(1)[0]
        else:
            top_artist, top_count = None, 0

        if primaries:
            top_ratio = top_count / len(primaries)
        else:
            top_ratio = 0.0

        summary = {
            'dominant_artist': top_artist,
            'dominant_ratio': top_ratio,
            'unique_main_artists': set(primaries),
            'has_collaboration_markup': has_collab
        }
        if cache is not None:
            cache['group_analysis'][group_cache_key] = summary
        return summary
|
|
|
|
|
|
def parse_artist_string(value: Any) -> dict[str, Any]:
    """Split an artist string into individual artist tokens.

    Returns a dict with ``display_artist`` (the stripped input text),
    ``tokens`` (non-empty fragments produced by ``ARTIST_SPLIT_PATTERN`` on
    the NFKC-normalized text, each cleaned by ``_normalize_artist_token``)
    and ``primary`` (the first token, or ``''`` when there is none).
    """
    display_artist = str(value or '').strip()
    if not display_artist:
        return {'display_artist': '', 'tokens': [], 'primary': ''}

    normalized = unicodedata.normalize('NFKC', display_artist)

    tokens = []
    for fragment in ARTIST_SPLIT_PATTERN.split(normalized):
        cleaned = _normalize_artist_token(fragment)
        if cleaned:
            tokens.append(cleaned)

    # Text made only of separators yields no fragments: keep it as one token.
    if not tokens:
        whole = _normalize_artist_token(normalized)
        if whole:
            tokens.append(whole)

    primary = tokens[0] if tokens else ''
    return {
        'display_artist': display_artist,
        'tokens': tokens,
        'primary': primary
    }
|
|
|
|
|
|
def _normalize_artist_token(value: str) -> str:
|
|
cleaned = unicodedata.normalize('NFKC', str(value or '')).strip()
|
|
cleaned = re.sub(r'\s+', ' ', cleaned)
|
|
return cleaned
|
|
|
|
|
|
def _clean_token(value: Any) -> str:
|
|
cleaned = unicodedata.normalize('NFKC', str(value or '')).strip().lower()
|
|
cleaned = re.sub(r'\s+', ' ', cleaned)
|
|
return cleaned
|
|
|
|
|
|
def _has_compilation_keyword(album: str) -> bool:
    """Tell whether an album title contains a compilation/soundtrack keyword.

    The title is normalized with ``_clean_token`` and checked against
    ``COMPILATION_KEYWORDS``.  ASCII keywords must not be embedded in a longer
    Latin word, so 'ost' matches "Movie OST" and "电影ost" but not "Ghost" or
    "Lost"; an optional plural "s" is accepted ("Soundtracks").  Digits next
    to a keyword are allowed ("Top100").  Non-ASCII keywords keep plain
    substring matching because CJK titles have no word separators.
    """
    normalized = _clean_token(album)
    if not normalized:
        return False

    for keyword in COMPILATION_KEYWORDS:
        if keyword.isascii():
            # Guard on ASCII letters only: ``\b`` would also treat digits and
            # CJK characters as part of the word.
            pattern = rf'(?<![a-z]){re.escape(keyword)}s?(?![a-z])'
            if re.search(pattern, normalized):
                return True
        elif keyword in normalized:
            return True
    return False
|
|
|
|
|
|
def _truthy_compilation(value: Any) -> bool:
|
|
if isinstance(value, bool):
|
|
return value
|
|
if isinstance(value, (int, float)):
|
|
return value != 0
|
|
return str(value or '').strip().lower() in {'1', 'true', 'yes'}
|