Add MusicWorkshop application

This commit is contained in:
liumangmang
2026-04-30 14:34:28 +08:00
parent 4cb403c956
commit 796f19990f
62 changed files with 21614 additions and 2168 deletions
+309
View File
@@ -0,0 +1,309 @@
from __future__ import annotations
import re
import unicodedata
from collections import Counter
from pathlib import Path
from typing import Any
# Fields that must all be non-blank before can_ingest_metadata() accepts a track.
INGEST_REQUIRED_FIELDS = ('title', 'artist', 'album_artist')

# Album-artist value assigned to albums classified as compilations.
VARIOUS_ARTISTS = 'Various Artists'

# Minimum share of an album group's tracks that one primary artist must hold
# to be chosen as album artist when the group has several primary artists.
MAIN_ARTIST_THRESHOLD = 0.7

# Separators between collaborating artists inside a single artist string.
# A lone "x" only counts as a separator when it has whitespace on both sides
# ("A x B"); a bare word-boundary test would also cut names that merely start
# or end with the letter, such as "Lil Nas X" or "X Japan".
ARTIST_SPLIT_PATTERN = re.compile(
    r'\s*(?:/|;|,|&|、|(?<=\s)x(?=\s)|\bfeat\.?(?=\s|$)|\bft\.?(?=\s|$)|\bfeaturing\b)\s*',
    re.IGNORECASE,
)

# Lower-case markers looked up in the normalized album title by
# _has_compilation_keyword() to detect compilations and soundtracks.
COMPILATION_KEYWORDS = (
    'top',
    'hits',
    'best',
    '精选',
    'ost',
    'original soundtrack',
    'soundtrack',
    '原声带',
)
def merge_metadata_layers(
    raw_metadata: dict[str, Any] | None,
    matched_metadata: dict[str, Any] | None,
    metadata_patch: dict[str, Any] | None = None
) -> dict[str, Any]:
    """Combine the three metadata layers into one new dict.

    Precedence, lowest to highest: ``raw_metadata``, ``matched_metadata``,
    ``metadata_patch``. None of the inputs is mutated.

    Args:
        raw_metadata: Base layer, copied as-is.
        matched_metadata: Overrides the base layer, except for values that
            are ``None`` or the empty string, which are ignored.
        metadata_patch: Overrides both other layers. Only ``None`` values are
            ignored, so an empty string in the patch blanks a field.

    Returns:
        The merged metadata.
    """
    combined = dict(raw_metadata or {})

    for field, matched_value in (matched_metadata or {}).items():
        if matched_value in (None, ''):
            continue
        combined[field] = matched_value

    for field, patched_value in (metadata_patch or {}).items():
        if patched_value is None:
            continue
        combined[field] = patched_value

    return combined
def normalize_metadata_shape(metadata: dict[str, Any] | None) -> dict[str, Any]:
    """Return a copy of ``metadata`` with null placeholders replaced.

    Text fields that are present with a ``None`` value become ``''`` and a
    present-but-blank ``compilation`` flag becomes ``0``. Keys that are absent
    stay absent, and all other keys are copied untouched.
    """
    text_fields = (
        'title',
        'artist',
        'album',
        'album_artist',
        'lyrics',
        'normalization_strategy',
        'album_artist_reason',
    )
    shaped = dict(metadata or {})

    for field in text_fields:
        if field not in shaped:
            continue
        if shaped[field] is None:
            shaped[field] = ''

    if 'compilation' in shaped:
        flag = shaped['compilation']
        if flag in ('', None):
            shaped['compilation'] = 0

    return shaped
def can_ingest_metadata(metadata: dict[str, Any]) -> bool:
    """Tell whether every field in INGEST_REQUIRED_FIELDS holds non-blank text.

    A field that is missing, falsy, or whitespace-only after ``str()``
    conversion makes the metadata non-ingestable.
    """
    for field in INGEST_REQUIRED_FIELDS:
        text = str(metadata.get(field) or '')
        if not text.strip():
            return False
    return True
class MetadataNormalizationService:
    """Derive album-level fields (album artist, compilation flag) for task items.

    A track's album artist is inferred from all active items of the same task
    that fall into the same album group (see ``_group_key``). An optional
    cache created by ``create_cache`` lets repeated calls reuse the task item
    list, the per-group entries, and the per-group analysis.
    """

    def __init__(self, task_store):
        """Keep the store; only ``list_all_task_items`` is called on it here."""
        self.task_store = task_store

    def create_cache(self) -> dict[str, dict[Any, Any]]:
        """Return a fresh, empty cache accepted by the ``cache`` parameters."""
        return {section: {} for section in ('task_items', 'group_entries', 'group_analysis')}

    def normalize_item(
        self,
        item: dict,
        metadata_patch: dict[str, Any] | None = None,
        cache: dict[str, dict[Any, Any]] | None = None
    ) -> dict[str, Any]:
        """Merge the item's metadata layers and normalize the result.

        Args:
            item: Task item; its ``original_tags_json`` and
                ``matched_metadata_json`` values are the lower layers.
            metadata_patch: Optional top layer applied to this item only.
            cache: Optional cache from ``create_cache``.

        Returns:
            The normalized metadata dict for ``item``.
        """
        layered = merge_metadata_layers(
            item.get('original_tags_json'),
            item.get('matched_metadata_json'),
            metadata_patch
        )
        entries_key, entries = self._build_group_entries(item, metadata_patch, cache)
        return self._normalize_merged_metadata(layered, entries, cache, entries_key)

    def _build_group_entries(
        self,
        item: dict,
        metadata_patch: dict[str, Any] | None,
        cache: dict[str, dict[Any, Any]] | None
    ) -> tuple[Any, list[dict[str, Any]]]:
        """Collect the task items that share ``item``'s album group.

        Returns:
            ``(cache_key, entries)``. Each entry holds ``item_id``, the merged
            ``metadata``, and ``artist_info`` parsed from its artist string.
            When no task item matches, ``item`` itself is the only entry.
        """
        own_metadata = merge_metadata_layers(
            item.get('original_tags_json'),
            item.get('matched_metadata_json'),
            metadata_patch
        )
        siblings = self._get_task_items(item['task_id'], cache)
        own_group = self._group_key(own_metadata, item)
        lookup_key = self._group_cache_key(item, own_group, metadata_patch)

        caching = cache is not None
        if caching and lookup_key in cache['group_entries']:
            return lookup_key, cache['group_entries'][lookup_key]

        def describe(item_id: Any, metadata: dict[str, Any]) -> dict[str, Any]:
            # One group entry: the item, its merged metadata, its parsed artist.
            return {
                'item_id': item_id,
                'metadata': metadata,
                'artist_info': parse_artist_string(metadata.get('artist'))
            }

        members: list[dict[str, Any]] = []
        for sibling in siblings:
            # The patch belongs to the item being normalized, never to siblings.
            sibling_patch = metadata_patch if sibling['id'] == item['id'] else None
            sibling_metadata = merge_metadata_layers(
                sibling.get('original_tags_json'),
                sibling.get('matched_metadata_json'),
                sibling_patch
            )
            if self._group_key(sibling_metadata, sibling) == own_group:
                members.append(describe(sibling['id'], sibling_metadata))

        if not members:
            members.append(describe(item['id'], own_metadata))

        if caching:
            cache['group_entries'][lookup_key] = members
        return lookup_key, members

    def _get_task_items(
        self,
        task_id: str,
        cache: dict[str, dict[Any, Any]] | None
    ) -> list[dict[str, Any]]:
        """Load the task's active items, going through the cache if one is given."""
        if cache is None:
            return self.task_store.list_all_task_items(task_id, active_only=True)

        loaded = cache['task_items']
        if loaded.get(task_id) is None:
            loaded[task_id] = self.task_store.list_all_task_items(task_id, active_only=True)
        return loaded[task_id]

    def _group_cache_key(
        self,
        item: dict,
        current_group_key: tuple[str, str],
        metadata_patch: dict[str, Any] | None
    ) -> tuple[Any, ...]:
        """Build the cache key for a group.

        Without a patch the key is shared by every item of the group; with a
        patch it is made specific to the item and to the patch contents.
        """
        patch_key = self._metadata_patch_cache_key(metadata_patch)
        shared_key = (item['task_id'], current_group_key)
        if patch_key is not None:
            return shared_key + (item['id'], patch_key)
        return shared_key

    def _metadata_patch_cache_key(self, metadata_patch: dict[str, Any] | None) -> tuple[tuple[str, str], ...] | None:
        """Turn a patch into a hashable, order-independent key; ``None`` if empty."""
        if not metadata_patch:
            return None
        pairs = [(field, repr(value)) for field, value in metadata_patch.items()]
        pairs.sort()
        return tuple(pairs)

    def _group_key(self, metadata: dict[str, Any], item: dict) -> tuple[str, str]:
        """Pick the album-group key for an item as ``(kind, value)``.

        The first available source wins: ``release_id``, ``release_group_id``,
        the album title, the parent directory of the item's path, and finally
        the item's own id (a group of one).
        """
        for id_field in ('release_id', 'release_group_id'):
            identifier = _clean_token(metadata.get(id_field))
            if identifier:
                return (id_field, identifier)

        album_title = _clean_token(metadata.get('album'))
        if album_title:
            return ('album', album_title)

        location = item.get('relative_path') or item.get('filename') or ''
        folder = Path(location).parent.as_posix()
        if folder not in ('', '.'):
            return ('path', _clean_token(folder))

        return ('item', str(item['id']))

    def _normalize_merged_metadata(
        self,
        merged: dict[str, Any],
        group_entries: list[dict[str, Any]],
        cache: dict[str, dict[Any, Any]] | None,
        group_cache_key: Any
    ) -> dict[str, Any]:
        """Fill in album artist, compilation flag, and the strategy that chose them.

        An album artist already present in ``merged`` is kept
        (``source_preserved``). Otherwise the group analysis selects one of:
        ``single_artist``, ``main_artist_feat``, ``compilation``,
        ``dominant_artist_fallback``, ``single_track_fallback``, or
        ``unresolved``. ``merged`` itself is not mutated.
        """
        result = dict(merged)
        track_artist = str(result.get('artist') or '').strip()
        declared_album_artist = str(result.get('album_artist') or '').strip()
        album_title = str(result.get('album') or '').strip()
        parsed_artist = parse_artist_string(track_artist)

        if declared_album_artist:
            result.update(
                album_artist=declared_album_artist,
                normalization_strategy='source_preserved',
                album_artist_reason='保留来源或人工指定的专辑艺术家',
                artist_tokens=parsed_artist['tokens'],
                display_artist=parsed_artist['display_artist'],
                compilation=1 if _truthy_compilation(result.get('compilation')) else 0,
            )
            return normalize_metadata_shape(result)

        analysis = self._analyze_group_entries(group_entries, cache, group_cache_key)
        dominant_artist = analysis['dominant_artist']
        dominant_ratio = analysis['dominant_ratio']
        distinct_primaries = len(analysis['unique_main_artists'])
        collaborative = analysis['has_collaboration_markup']
        keyword_hit = _has_compilation_keyword(album_title)

        # Defaults describe the case where nothing below applies.
        strategy = 'unresolved'
        album_artist = ''
        reason = '无法从当前专辑分组推导专辑艺术家'
        compilation = 0

        if distinct_primaries == 1 and dominant_artist:
            # One primary artist across the whole group.
            album_artist = dominant_artist
            if collaborative:
                strategy = 'main_artist_feat'
                reason = '同专辑主艺人一致,但存在 feat/合作曲目,按主艺人专辑处理'
            else:
                strategy = 'single_artist'
                reason = '同专辑曲目的主艺人一致,按单艺人专辑处理'
        elif dominant_artist and dominant_ratio >= MAIN_ARTIST_THRESHOLD:
            strategy = 'main_artist_feat'
            album_artist = dominant_artist
            reason = f'主艺人 {dominant_artist} 在同专辑中占比达到 {dominant_ratio:.0%}'
        elif distinct_primaries > 1 and keyword_hit:
            strategy = 'compilation'
            album_artist = VARIOUS_ARTISTS
            reason = '多艺人分散且专辑名命中合辑/原声带关键词'
            compilation = 1
        elif dominant_artist:
            strategy = 'dominant_artist_fallback'
            album_artist = dominant_artist
            reason = f'未命中合辑规则,回退到出现频次最高的主艺人 {dominant_artist}'
        elif track_artist:
            strategy = 'single_track_fallback'
            album_artist = parsed_artist['primary'] or track_artist
            reason = '仅有当前曲目可用,回退到当前曲目艺人'

        result.update(
            album_artist=album_artist,
            compilation=compilation,
            normalization_strategy=strategy,
            album_artist_reason=reason,
            artist_tokens=parsed_artist['tokens'],
            display_artist=parsed_artist['display_artist'],
        )
        return normalize_metadata_shape(result)

    def _analyze_group_entries(
        self,
        group_entries: list[dict[str, Any]],
        cache: dict[str, dict[Any, Any]] | None,
        group_cache_key: Any
    ) -> dict[str, Any]:
        """Summarize the primary artists of a group.

        Returns:
            A dict with ``dominant_artist`` (most frequent primary artist, or
            ``None``), ``dominant_ratio`` (its share among entries that have a
            primary artist), ``unique_main_artists`` (set of primary artists),
            and ``has_collaboration_markup`` (any entry with several tokens).
            The result is stored in and served from ``cache`` when given.
        """
        memo = None if cache is None else cache['group_analysis']
        if memo is not None and group_cache_key in memo:
            return memo[group_cache_key]

        primary_counts: Counter = Counter()
        collaboration_seen = False
        for entry in group_entries:
            info = entry['artist_info']
            primary = info['primary']
            if primary:
                primary_counts[primary] += 1
            if len(info['tokens']) > 1:
                collaboration_seen = True

        tracks_with_primary = sum(primary_counts.values())
        if primary_counts:
            dominant_artist, dominant_count = primary_counts.most_common(1)[0]
        else:
            dominant_artist, dominant_count = None, 0

        analysis = {
            'dominant_artist': dominant_artist,
            'dominant_ratio': (dominant_count / tracks_with_primary) if tracks_with_primary else 0.0,
            'unique_main_artists': set(primary_counts),
            'has_collaboration_markup': collaboration_seen
        }
        if memo is not None:
            memo[group_cache_key] = analysis
        return analysis
def parse_artist_string(value: Any) -> dict[str, Any]:
    """Split an artist credit into its individual artist names.

    Args:
        value: Raw artist value; falsy values are treated as empty.

    Returns:
        A dict with ``display_artist`` (the stripped input text), ``tokens``
        (cleaned names in their original order), and ``primary`` (the first
        token, or ``''`` when there is none).
    """
    display_artist = str(value or '').strip()
    if not display_artist:
        return {'display_artist': '', 'tokens': [], 'primary': ''}

    folded = unicodedata.normalize('NFKC', display_artist)

    tokens = []
    for piece in ARTIST_SPLIT_PATTERN.split(folded):
        name = _normalize_artist_token(piece)
        if name:
            tokens.append(name)

    if not tokens:
        # The text consisted only of separators: keep it as a single name.
        whole = _normalize_artist_token(folded)
        if whole:
            tokens.append(whole)

    primary = tokens[0] if tokens else ''
    return {
        'display_artist': display_artist,
        'tokens': tokens,
        'primary': primary
    }
def _normalize_artist_token(value: str) -> str:
cleaned = unicodedata.normalize('NFKC', str(value or '')).strip()
cleaned = re.sub(r'\s+', ' ', cleaned)
return cleaned
def _clean_token(value: Any) -> str:
cleaned = unicodedata.normalize('NFKC', str(value or '')).strip().lower()
cleaned = re.sub(r'\s+', ' ', cleaned)
return cleaned
def _has_compilation_keyword(album: str) -> bool:
    """Tell whether an album title contains a compilation/soundtrack keyword.

    The title is normalized with ``_clean_token`` and checked against every
    entry of ``COMPILATION_KEYWORDS``. ASCII keywords must not be embedded in
    a longer Latin word, so 'ost' matches "Frozen OST" but not "Ghost
    Stories", and 'top' matches "Top100" but not "Utopia". Non-ASCII keywords
    are matched as plain substrings because CJK text has no word separators.

    Args:
        album: Album title; falsy values never match.

    Returns:
        ``True`` when at least one keyword is found.
    """
    normalized = _clean_token(album)
    if not normalized:
        return False

    for keyword in COMPILATION_KEYWORDS:
        if keyword.isascii():
            # Only ASCII letters count as "part of the same word": digits and
            # CJK characters directly next to the keyword still allow a match.
            standalone = rf'(?<![a-z]){re.escape(keyword)}(?![a-z])'
            if re.search(standalone, normalized):
                return True
        elif keyword in normalized:
            return True
    return False
def _truthy_compilation(value: Any) -> bool:
if isinstance(value, bool):
return value
if isinstance(value, (int, float)):
return value != 0
return str(value or '').strip().lower() in {'1', 'true', 'yes'}