Add MusicWorkshop application
This commit is contained in:
@@ -0,0 +1,309 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Metadata fields that must all be non-blank before an item may be ingested.
INGEST_REQUIRED_FIELDS = ('title', 'artist', 'album_artist')

# Album artist assigned to albums that are classified as compilations.
VARIOUS_ARTISTS = 'Various Artists'

# Minimum share of a group's tracks that one main artist must hold for the
# album to be attributed to that artist.
MAIN_ARTIST_THRESHOLD = 0.7

# Separators between individual artists inside one artist string.
# The "x" collaboration marker only counts when it stands alone between
# whitespace ("A x B"); a plain word-boundary test would also cut names such
# as "Lil Nas X", "X Japan" or "X-Ray Spex" apart.
# "feat"/"ft" (with optional dot) must be followed by whitespace or the end
# of the string so that words merely starting with them are left intact.
ARTIST_SPLIT_PATTERN = re.compile(
    r'\s*(?:/|;|,|&|、|(?<=\s)x(?=\s)|\bfeat\.?(?=\s|$)|\bft\.?(?=\s|$)|\bfeaturing\b)\s*',
    re.IGNORECASE
)

# Keywords looked for in the normalized (lower-cased) album title to detect
# compilations and soundtracks.
COMPILATION_KEYWORDS = (
    'top',
    'hits',
    'best',
    '精选',
    'ost',
    'original soundtrack',
    'soundtrack',
    '原声带'
)
|
||||
|
||||
|
||||
def merge_metadata_layers(
    raw_metadata: dict[str, Any] | None,
    matched_metadata: dict[str, Any] | None,
    metadata_patch: dict[str, Any] | None = None
) -> dict[str, Any]:
    """Combine the three metadata layers into one new dictionary.

    Later layers win over earlier ones: raw tags first, then matched
    metadata, then the manual patch.

    Args:
        raw_metadata: Base layer; copied as-is. ``None`` counts as empty.
        matched_metadata: Overrides the base, except for values that are
            ``None`` or the empty string, which are ignored.
        metadata_patch: Overrides everything, except for ``None`` values.
            An empty string in the patch IS applied.

    Returns:
        A new dict; none of the inputs is modified.
    """
    combined: dict[str, Any] = {}
    if raw_metadata:
        combined.update(raw_metadata)

    for field, matched_value in (matched_metadata or {}).items():
        if matched_value in (None, ''):
            continue
        combined[field] = matched_value

    for field, patched_value in (metadata_patch or {}).items():
        if patched_value is None:
            continue
        combined[field] = patched_value

    return combined
|
||||
|
||||
|
||||
def normalize_metadata_shape(metadata: dict[str, Any] | None) -> dict[str, Any]:
    """Return a copy of *metadata* with ``None`` placeholders replaced.

    Known text fields that are present with a ``None`` value become ``''``;
    a present ``compilation`` value of ``''`` or ``None`` becomes ``0``.
    Missing keys are not added and all other keys pass through unchanged.
    """
    text_fields = (
        'title',
        'artist',
        'album',
        'album_artist',
        'lyrics',
        'normalization_strategy',
        'album_artist_reason',
    )
    shaped = dict(metadata) if metadata else {}

    for field in text_fields:
        # The non-None default keeps absent keys absent.
        if shaped.get(field, '') is None:
            shaped[field] = ''

    # Default 0 is never in the tuple, so an absent key is left absent.
    if shaped.get('compilation', 0) in ('', None):
        shaped['compilation'] = 0

    return shaped
|
||||
|
||||
|
||||
def can_ingest_metadata(metadata: dict[str, Any]) -> bool:
    """Tell whether every field in ``INGEST_REQUIRED_FIELDS`` is non-blank.

    A field counts as blank when it is missing, falsy, or consists only of
    whitespace once converted to a string.
    """
    for field in INGEST_REQUIRED_FIELDS:
        text = str(metadata.get(field) or '')
        if not text.strip():
            return False
    return True
|
||||
|
||||
|
||||
class MetadataNormalizationService:
    """Derive album-artist and compilation metadata for task items.

    An item is analysed together with the other items of its task (as
    returned by ``task_store.list_all_task_items(task_id, active_only=True)``)
    that fall into the same group: same release id, release-group id, album
    name, or parent directory. The album artist is inferred from the main
    artists found in that group unless the item already carries one.
    """

    def __init__(self, task_store):
        # Only ``list_all_task_items`` is called on the store in this class.
        self.task_store = task_store

    def create_cache(self) -> dict[str, dict[Any, Any]]:
        """Return a fresh, empty cache to share across ``normalize_item`` calls."""
        return {section: {} for section in ('task_items', 'group_entries', 'group_analysis')}

    def normalize_item(
        self,
        item: dict,
        metadata_patch: dict[str, Any] | None = None,
        cache: dict[str, dict[Any, Any]] | None = None
    ) -> dict[str, Any]:
        """Merge an item's metadata layers and normalize the result.

        Args:
            item: Task item; reads ``original_tags_json``,
                ``matched_metadata_json``, ``task_id`` and ``id``.
            metadata_patch: Optional manual overrides for this item only.
            cache: Optional cache created by ``create_cache``.

        Returns:
            The normalized metadata dictionary.
        """
        layered = merge_metadata_layers(
            item.get('original_tags_json'),
            item.get('matched_metadata_json'),
            metadata_patch
        )
        cache_key, entries = self._build_group_entries(item, metadata_patch, cache)
        return self._normalize_merged_metadata(layered, entries, cache, cache_key)

    def _build_group_entries(
        self,
        item: dict,
        metadata_patch: dict[str, Any] | None,
        cache: dict[str, dict[Any, Any]] | None
    ) -> tuple[Any, list[dict[str, Any]]]:
        """Collect the task items that share *item*'s group.

        Returns:
            ``(cache_key, entries)`` where each entry holds ``item_id``, the
            merged ``metadata`` and the parsed ``artist_info``. If nothing in
            the task matches, the list contains only *item* itself.
        """

        def make_entry(item_id: Any, metadata: dict[str, Any]) -> dict[str, Any]:
            return {
                'item_id': item_id,
                'metadata': metadata,
                'artist_info': parse_artist_string(metadata.get('artist'))
            }

        own_metadata = merge_metadata_layers(
            item.get('original_tags_json'),
            item.get('matched_metadata_json'),
            metadata_patch
        )
        siblings = self._get_task_items(item['task_id'], cache)
        own_group = self._group_key(own_metadata, item)
        cache_key = self._group_cache_key(item, own_group, metadata_patch)

        if cache is not None:
            cached_groups = cache['group_entries']
            if cache_key in cached_groups:
                return cache_key, cached_groups[cache_key]

        entries: list[dict[str, Any]] = []
        for candidate in siblings:
            # The patch applies to the current item only, never to siblings.
            is_current = candidate['id'] == item['id']
            candidate_metadata = merge_metadata_layers(
                candidate.get('original_tags_json'),
                candidate.get('matched_metadata_json'),
                metadata_patch if is_current else None
            )
            if self._group_key(candidate_metadata, candidate) == own_group:
                entries.append(make_entry(candidate['id'], candidate_metadata))

        if not entries:
            entries = [make_entry(item['id'], own_metadata)]

        if cache is not None:
            cache['group_entries'][cache_key] = entries

        return cache_key, entries

    def _get_task_items(
        self,
        task_id: str,
        cache: dict[str, dict[Any, Any]] | None
    ) -> list[dict[str, Any]]:
        """Return the task's items, loading them at most once per cache."""
        if cache is not None:
            per_task = cache['task_items']
            cached_items = per_task.get(task_id)
            if cached_items is not None:
                return cached_items
            fetched = self.task_store.list_all_task_items(task_id, active_only=True)
            per_task[task_id] = fetched
            return fetched

        return self.task_store.list_all_task_items(task_id, active_only=True)

    def _group_cache_key(
        self,
        item: dict,
        current_group_key: tuple[str, str],
        metadata_patch: dict[str, Any] | None
    ) -> tuple[Any, ...]:
        """Build the cache key for a group.

        Without a patch the key is shared by all items of the group; with a
        patch it is specific to the patched item and the patch contents.
        """
        patch_key = self._metadata_patch_cache_key(metadata_patch)
        key: tuple[Any, ...] = (item['task_id'], current_group_key)
        if patch_key is not None:
            key += (item['id'], patch_key)
        return key

    def _metadata_patch_cache_key(self, metadata_patch: dict[str, Any] | None) -> tuple[tuple[str, str], ...] | None:
        """Return a hashable, order-independent fingerprint of the patch.

        Returns ``None`` for a missing or empty patch.
        """
        if not metadata_patch:
            return None
        # repr() makes arbitrary (possibly unhashable) values comparable.
        pairs = [(name, repr(value)) for name, value in metadata_patch.items()]
        pairs.sort()
        return tuple(pairs)

    def _group_key(self, metadata: dict[str, Any], item: dict) -> tuple[str, str]:
        """Return ``(kind, value)`` identifying the group an item belongs to.

        Precedence: release id, release-group id, album name, parent
        directory of the item's path, and finally the item's own id.
        """
        for field in ('release_id', 'release_group_id', 'album'):
            token = _clean_token(metadata.get(field))
            if token:
                return (field, token)

        location = item.get('relative_path') or item.get('filename') or ''
        folder = Path(location).parent.as_posix()
        # '.' means the path had no directory component.
        if folder not in ('', '.'):
            return ('path', _clean_token(folder))

        return ('item', str(item['id']))

    def _normalize_merged_metadata(
        self,
        merged: dict[str, Any],
        group_entries: list[dict[str, Any]],
        cache: dict[str, dict[Any, Any]] | None,
        group_cache_key: Any
    ) -> dict[str, Any]:
        """Fill in album artist, compilation flag and strategy fields.

        An existing non-blank ``album_artist`` is preserved. Otherwise the
        album artist is derived from the group analysis by the first rule
        that applies (single artist, main artist with features, dominant
        share, compilation keyword, most frequent artist, current track).
        """
        result = dict(merged)
        artist_text = str(result.get('artist') or '').strip()
        explicit_album_artist = str(result.get('album_artist') or '').strip()
        album_title = str(result.get('album') or '').strip()
        artist_info = parse_artist_string(artist_text)

        if explicit_album_artist:
            result.update(
                album_artist=explicit_album_artist,
                normalization_strategy='source_preserved',
                album_artist_reason='保留来源或人工指定的专辑艺术家',
                artist_tokens=artist_info['tokens'],
                display_artist=artist_info['display_artist'],
                compilation=1 if _truthy_compilation(result.get('compilation')) else 0,
            )
            return normalize_metadata_shape(result)

        analysis = self._analyze_group_entries(group_entries, cache, group_cache_key)
        dominant_artist = analysis['dominant_artist']
        dominant_ratio = analysis['dominant_ratio']
        distinct_main_artists = len(analysis['unique_main_artists'])
        has_collaboration_markup = analysis['has_collaboration_markup']
        compilation_keyword_hit = _has_compilation_keyword(album_title)

        compilation = 0
        if distinct_main_artists == 1 and dominant_artist:
            album_artist = dominant_artist
            if has_collaboration_markup:
                strategy = 'main_artist_feat'
                reason = '同专辑主艺人一致,但存在 feat/合作曲目,按主艺人专辑处理'
            else:
                strategy = 'single_artist'
                reason = '同专辑曲目的主艺人一致,按单艺人专辑处理'
        elif dominant_artist and dominant_ratio >= MAIN_ARTIST_THRESHOLD:
            strategy = 'main_artist_feat'
            album_artist = dominant_artist
            reason = f'主艺人 {dominant_artist} 在同专辑中占比达到 {dominant_ratio:.0%}'
        elif distinct_main_artists > 1 and compilation_keyword_hit:
            strategy = 'compilation'
            album_artist = VARIOUS_ARTISTS
            reason = '多艺人分散且专辑名命中合辑/原声带关键词'
            compilation = 1
        elif dominant_artist:
            strategy = 'dominant_artist_fallback'
            album_artist = dominant_artist
            reason = f'未命中合辑规则,回退到出现频次最高的主艺人 {dominant_artist}'
        elif artist_text:
            strategy = 'single_track_fallback'
            album_artist = artist_info['primary'] or artist_text
            reason = '仅有当前曲目可用,回退到当前曲目艺人'
        else:
            strategy = 'unresolved'
            album_artist = ''
            reason = '无法从当前专辑分组推导专辑艺术家'

        result.update(
            album_artist=album_artist,
            compilation=compilation,
            normalization_strategy=strategy,
            album_artist_reason=reason,
            artist_tokens=artist_info['tokens'],
            display_artist=artist_info['display_artist'],
        )
        return normalize_metadata_shape(result)

    def _analyze_group_entries(
        self,
        group_entries: list[dict[str, Any]],
        cache: dict[str, dict[Any, Any]] | None,
        group_cache_key: Any
    ) -> dict[str, Any]:
        """Summarize the main artists of a group.

        Returns:
            Dict with ``dominant_artist`` (most frequent primary artist or
            ``None``), ``dominant_ratio`` (its share among entries that have
            a primary artist), ``unique_main_artists`` (set) and
            ``has_collaboration_markup`` (any entry with several tokens).
            The result is stored in / served from *cache* when one is given.
        """
        if cache is not None:
            known = cache['group_analysis']
            if group_cache_key in known:
                return known[group_cache_key]

        primaries = [
            entry['artist_info']['primary']
            for entry in group_entries
            if entry['artist_info']['primary']
        ]
        collaboration = False
        for entry in group_entries:
            if len(entry['artist_info']['tokens']) > 1:
                collaboration = True
                break

        if primaries:
            dominant_artist, dominant_count = Counter(primaries).most_common(1)[0]
            dominant_ratio = dominant_count / len(primaries)
        else:
            dominant_artist, dominant_ratio = None, 0.0

        analysis = {
            'dominant_artist': dominant_artist,
            'dominant_ratio': dominant_ratio,
            'unique_main_artists': set(primaries),
            'has_collaboration_markup': collaboration
        }
        if cache is not None:
            cache['group_analysis'][group_cache_key] = analysis
        return analysis
|
||||
|
||||
|
||||
def _strip_dangling_brackets(token: str) -> str:
    """Remove bracket characters orphaned at the edges of *token*.

    Splitting ``"A (feat. B)"`` at ``feat.`` leaves ``"A ("`` and ``"B)"``.
    Openers at the end, closers at the start, and unbalanced brackets at
    either edge are removed; balanced pairs such as ``"(G)I-DLE"`` are kept.
    """
    pairs = (('(', ')'), ('[', ']'), ('{', '}'))
    previous = None
    # Repeat until stable so nested leftovers like "B)]" are fully removed.
    while previous != token:
        previous = token
        for opener, closer in pairs:
            if token.endswith(opener) or (
                token.endswith(closer) and token.count(closer) > token.count(opener)
            ):
                token = token[:-1].rstrip()
            if token.startswith(closer) or (
                token.startswith(opener) and token.count(opener) > token.count(closer)
            ):
                token = token[1:].lstrip()
    return token


def parse_artist_string(value: Any) -> dict[str, Any]:
    """Split an artist string into individual artist tokens.

    Args:
        value: Raw artist value; falsy values are treated as empty.

    Returns:
        Dict with ``display_artist`` (the stripped input), ``tokens`` (the
        NFKC-normalized artist names in order of appearance) and ``primary``
        (the first token, or ``''`` when there is none).
    """
    display_artist = str(value or '').strip()
    if not display_artist:
        return {'display_artist': '', 'tokens': [], 'primary': ''}

    normalized = unicodedata.normalize('NFKC', display_artist)
    pieces = ARTIST_SPLIT_PATTERN.split(normalized)
    # Brackets are only cleaned up when a separator was actually found, so a
    # lone artist name is never altered.
    was_split = len(pieces) > 1

    tokens = []
    for piece in pieces:
        token = _normalize_artist_token(piece)
        if was_split:
            token = _strip_dangling_brackets(token)
        if token:
            tokens.append(token)

    if not tokens:
        # The string consisted of separators only; keep it as a single token.
        fallback = _normalize_artist_token(normalized)
        tokens = [fallback] if fallback else []

    return {
        'display_artist': display_artist,
        'tokens': tokens,
        'primary': tokens[0] if tokens else ''
    }
|
||||
|
||||
|
||||
def _normalize_artist_token(value: str) -> str:
    """Return *value* NFKC-normalized, trimmed, with whitespace runs collapsed.

    Falsy input yields ``''``. Letter case is preserved.
    """
    text = str(value) if value else ''
    trimmed = unicodedata.normalize('NFKC', text).strip()
    return re.sub(r'\s+', ' ', trimmed)
|
||||
|
||||
|
||||
def _clean_token(value: Any) -> str:
    """Return *value* as a comparison key.

    The value is NFKC-normalized, trimmed, lower-cased and has whitespace
    runs collapsed to a single space. Falsy input yields ``''``.
    """
    cleaned = unicodedata.normalize('NFKC', str(value or '')).strip().lower()
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned


def _has_compilation_keyword(album: str, keywords: tuple[str, ...] | None = None) -> bool:
    """Tell whether the album title contains a compilation/soundtrack keyword.

    ASCII keywords must appear as whole words (an optional plural "s" is
    tolerated), so "ost" no longer matches "Ghost" and "top" no longer
    matches "Stop". Only ASCII letters count as word characters, which keeps
    titles like "Top100" or a CJK title ending in "OST" matching. Non-ASCII
    keywords are matched as plain substrings because CJK text has no word
    separators.

    Args:
        album: Album title; falsy values never match.
        keywords: Keywords to look for. Defaults to ``COMPILATION_KEYWORDS``.

    Returns:
        ``True`` if any keyword is found in the normalized title.
    """
    normalized = _clean_token(album)
    if not normalized:
        return False
    if keywords is None:
        keywords = COMPILATION_KEYWORDS

    for keyword in keywords:
        needle = _clean_token(keyword)
        if not needle:
            continue
        if needle.isascii():
            pattern = rf'(?<![a-z]){re.escape(needle)}s?(?![a-z])'
            if re.search(pattern, normalized):
                return True
        elif needle in normalized:
            return True
    return False
|
||||
|
||||
|
||||
def _truthy_compilation(value: Any) -> bool:
    """Interpret a raw ``compilation`` tag value as a boolean.

    Booleans are returned unchanged, other numbers are true when non-zero,
    and anything else is true only if its trimmed, lower-cased string form
    is ``'1'``, ``'true'`` or ``'yes'``.
    """
    if isinstance(value, (int, float)):
        # bool is a subclass of int; hand it back untouched.
        return value if isinstance(value, bool) else value != 0

    text = str(value) if value else ''
    return text.strip().lower() in ('1', 'true', 'yes')
|
||||
Reference in New Issue
Block a user