Add MusicWorkshop application

2026-04-30 14:34:28 +08:00
parent 4cb403c956
commit 796f19990f
62 changed files with 21614 additions and 2168 deletions
@@ -0,0 +1,309 @@
+from __future__ import annotations
+
+import re
+import unicodedata
+from collections import Counter
+from pathlib import Path
+from typing import Any
+
+
+# Metadata fields that must all be non-blank for can_ingest_metadata() to
+# accept a record.
+INGEST_REQUIRED_FIELDS = ('title', 'artist', 'album_artist')
+# Album artist assigned when an album group is classified as a compilation.
+VARIOUS_ARTISTS = 'Various Artists'
+# Minimum share (0..1) of the group's primary artists that the most frequent
+# one must reach to be chosen as album artist by the ratio rule.
+MAIN_ARTIST_THRESHOLD = 0.7
+# Separators used to split an artist credit into individual names: "/", ";",
+# ",", "&", "、", a standalone "x", "feat"/"ft" (optionally dotted, and only
+# when followed by whitespace or end of string) and "featuring". Whitespace
+# around a separator is consumed; matching is case-insensitive.
+ARTIST_SPLIT_PATTERN = re.compile(
+  r'\s*(?:/|;|,|&|、|\bx\b|\bfeat\.?(?=\s|$)|\bft\.?(?=\s|$)|\bfeaturing\b)\s*',
+  re.IGNORECASE
+)
+# Keywords (lower case) that _has_compilation_keyword() looks for in the
+# normalized album title. '精选' means "selection / best of" and '原声带'
+# means "original soundtrack".
+COMPILATION_KEYWORDS = (
+  'top',
+  'hits',
+  'best',
+  '精选',
+  'ost',
+  'original soundtrack',
+  'soundtrack',
+  '原声带'
+)
+
+
+def merge_metadata_layers(
+  raw_metadata: dict[str, Any] | None,
+  matched_metadata: dict[str, Any] | None,
+  metadata_patch: dict[str, Any] | None = None
+) -> dict[str, Any]:
+  merged = dict(raw_metadata or {})
+  merged.update({key: value for key, value in (matched_metadata or {}).items() if value not in (None, '')})
+  merged.update({key: value for key, value in (metadata_patch or {}).items() if value is not None})
+  return merged
+
+
+def normalize_metadata_shape(metadata: dict[str, Any] | None) -> dict[str, Any]:
+  normalized = dict(metadata or {})
+  for key in ('title', 'artist', 'album', 'album_artist', 'lyrics', 'normalization_strategy', 'album_artist_reason'):
+    if key in normalized and normalized[key] is None:
+      normalized[key] = ''
+  if 'compilation' in normalized and normalized['compilation'] in ('', None):
+    normalized['compilation'] = 0
+  return normalized
+
+
+def can_ingest_metadata(metadata: dict[str, Any]) -> bool:
+  return all(str(metadata.get(field) or '').strip() for field in INGEST_REQUIRED_FIELDS)
+
+
+class MetadataNormalizationService:
+  def __init__(self, task_store):
+    self.task_store = task_store
+
+  def create_cache(self) -> dict[str, dict[Any, Any]]:
+    return {
+      'task_items': {},
+      'group_entries': {},
+      'group_analysis': {}
+    }
+
+  def normalize_item(
+    self,
+    item: dict,
+    metadata_patch: dict[str, Any] | None = None,
+    cache: dict[str, dict[Any, Any]] | None = None
+  ) -> dict[str, Any]:
+    merged = merge_metadata_layers(
+      item.get('original_tags_json'),
+      item.get('matched_metadata_json'),
+      metadata_patch
+    )
+    group_cache_key, group_entries = self._build_group_entries(item, metadata_patch, cache)
+    return self._normalize_merged_metadata(merged, group_entries, cache, group_cache_key)
+
+  def _build_group_entries(
+    self,
+    item: dict,
+    metadata_patch: dict[str, Any] | None,
+    cache: dict[str, dict[Any, Any]] | None
+  ) -> tuple[Any, list[dict[str, Any]]]:
+    current_merged = merge_metadata_layers(
+      item.get('original_tags_json'),
+      item.get('matched_metadata_json'),
+      metadata_patch
+    )
+    task_items = self._get_task_items(item['task_id'], cache)
+    current_group_key = self._group_key(current_merged, item)
+    cache_key = self._group_cache_key(item, current_group_key, metadata_patch)
+    if cache is not None and cache_key in cache['group_entries']:
+      return cache_key, cache['group_entries'][cache_key]
+    entries: list[dict[str, Any]] = []
+
+    for candidate in task_items:
+      candidate_patch = metadata_patch if candidate['id'] == item['id'] else None
+      merged = merge_metadata_layers(
+        candidate.get('original_tags_json'),
+        candidate.get('matched_metadata_json'),
+        candidate_patch
+      )
+      if self._group_key(merged, candidate) != current_group_key:
+        continue
+      entries.append(
+        {
+          'item_id': candidate['id'],
+          'metadata': merged,
+          'artist_info': parse_artist_string(merged.get('artist'))
+        }
+      )
+
+    if not entries:
+      entries = [
+        {
+          'item_id': item['id'],
+          'metadata': current_merged,
+          'artist_info': parse_artist_string(current_merged.get('artist'))
+        }
+      ]
+
+    if cache is not None:
+      cache['group_entries'][cache_key] = entries
+
+    return cache_key, entries
+
+  def _get_task_items(
+    self,
+    task_id: str,
+    cache: dict[str, dict[Any, Any]] | None
+  ) -> list[dict[str, Any]]:
+    if cache is None:
+      return self.task_store.list_all_task_items(task_id, active_only=True)
+
+    task_items = cache['task_items'].get(task_id)
+    if task_items is None:
+      task_items = self.task_store.list_all_task_items(task_id, active_only=True)
+      cache['task_items'][task_id] = task_items
+    return task_items
+
+  def _group_cache_key(
+    self,
+    item: dict,
+    current_group_key: tuple[str, str],
+    metadata_patch: dict[str, Any] | None
+  ) -> tuple[Any, ...]:
+    patch_key = self._metadata_patch_cache_key(metadata_patch)
+    if patch_key is None:
+      return (item['task_id'], current_group_key)
+    return (item['task_id'], current_group_key, item['id'], patch_key)
+
+  def _metadata_patch_cache_key(self, metadata_patch: dict[str, Any] | None) -> tuple[tuple[str, str], ...] | None:
+    if not metadata_patch:
+      return None
+    return tuple(sorted((key, repr(value)) for key, value in metadata_patch.items()))
+
+  def _group_key(self, metadata: dict[str, Any], item: dict) -> tuple[str, str]:
+    for key in ('release_id', 'release_group_id'):
+      value = _clean_token(metadata.get(key))
+      if value:
+        return (key, value)
+
+    album = _clean_token(metadata.get('album'))
+    if album:
+      return ('album', album)
+
+    parent_dir = Path(item.get('relative_path') or item.get('filename') or '').parent.as_posix()
+    if parent_dir and parent_dir != '.':
+      return ('path', _clean_token(parent_dir))
+
+    return ('item', str(item['id']))
+
+  def _normalize_merged_metadata(
+    self,
+    merged: dict[str, Any],
+    group_entries: list[dict[str, Any]],
+    cache: dict[str, dict[Any, Any]] | None,
+    group_cache_key: Any
+  ) -> dict[str, Any]:
+    normalized = dict(merged)
+    artist_value = str(normalized.get('artist') or '').strip()
+    album_artist_value = str(normalized.get('album_artist') or '').strip()
+    album_value = str(normalized.get('album') or '').strip()
+    artist_info = parse_artist_string(artist_value)
+
+    if album_artist_value:
+      normalized['album_artist'] = album_artist_value
+      normalized['normalization_strategy'] = 'source_preserved'
+      normalized['album_artist_reason'] = '保留来源或人工指定的专辑艺术家'
+      normalized['artist_tokens'] = artist_info['tokens']
+      normalized['display_artist'] = artist_info['display_artist']
+      normalized['compilation'] = 1 if _truthy_compilation(normalized.get('compilation')) else 0
+      return normalize_metadata_shape(normalized)
+
+    group_analysis = self._analyze_group_entries(group_entries, cache, group_cache_key)
+    dominant_artist = group_analysis['dominant_artist']
+    dominant_ratio = group_analysis['dominant_ratio']
+    unique_main_artists = group_analysis['unique_main_artists']
+    has_collaboration_markup = group_analysis['has_collaboration_markup']
+    compilation_keyword_hit = _has_compilation_keyword(album_value)
+
+    strategy = 'unresolved'
+    album_artist = ''
+    reason = '无法从当前专辑分组推导专辑艺术家'
+    compilation = 0
+
+    if len(unique_main_artists) == 1 and dominant_artist and not has_collaboration_markup:
+      strategy = 'single_artist'
+      album_artist = dominant_artist
+      reason = '同专辑曲目的主艺人一致，按单艺人专辑处理'
+    elif len(unique_main_artists) == 1 and dominant_artist and has_collaboration_markup:
+      strategy = 'main_artist_feat'
+      album_artist = dominant_artist
+      reason = '同专辑主艺人一致，但存在 feat/合作曲目，按主艺人专辑处理'
+    elif dominant_artist and dominant_ratio >= MAIN_ARTIST_THRESHOLD:
+      strategy = 'main_artist_feat'
+      album_artist = dominant_artist
+      reason = f'主艺人 {dominant_artist} 在同专辑中占比达到 {dominant_ratio:.0%}'
+    elif len(unique_main_artists) > 1 and compilation_keyword_hit:
+      strategy = 'compilation'
+      album_artist = VARIOUS_ARTISTS
+      reason = '多艺人分散且专辑名命中合辑/原声带关键词'
+      compilation = 1
+    elif dominant_artist:
+      strategy = 'dominant_artist_fallback'
+      album_artist = dominant_artist
+      reason = f'未命中合辑规则，回退到出现频次最高的主艺人 {dominant_artist}'
+    elif artist_value:
+      strategy = 'single_track_fallback'
+      album_artist = artist_info['primary'] or artist_value
+      reason = '仅有当前曲目可用，回退到当前曲目艺人'
+
+    normalized['album_artist'] = album_artist
+    normalized['compilation'] = compilation
+    normalized['normalization_strategy'] = strategy
+    normalized['album_artist_reason'] = reason
+    normalized['artist_tokens'] = artist_info['tokens']
+    normalized['display_artist'] = artist_info['display_artist']
+    return normalize_metadata_shape(normalized)
+
+  def _analyze_group_entries(
+    self,
+    group_entries: list[dict[str, Any]],
+    cache: dict[str, dict[Any, Any]] | None,
+    group_cache_key: Any
+  ) -> dict[str, Any]:
+    if cache is not None and group_cache_key in cache['group_analysis']:
+      return cache['group_analysis'][group_cache_key]
+
+    main_artists = [entry['artist_info']['primary'] for entry in group_entries if entry['artist_info']['primary']]
+    has_collaboration_markup = any(len(entry['artist_info']['tokens']) > 1 for entry in group_entries)
+    main_artist_counts = Counter(main_artists)
+    unique_main_artists = set(main_artists)
+    dominant_artist, dominant_count = main_artist_counts.most_common(1)[0] if main_artist_counts else (None, 0)
+    analysis = {
+      'dominant_artist': dominant_artist,
+      'dominant_ratio': (dominant_count / len(main_artists)) if main_artists else 0.0,
+      'unique_main_artists': unique_main_artists,
+      'has_collaboration_markup': has_collaboration_markup
+    }
+    if cache is not None:
+      cache['group_analysis'][group_cache_key] = analysis
+    return analysis
+
+
+def parse_artist_string(value: Any) -> dict[str, Any]:
+  display_artist = str(value or '').strip()
+  if not display_artist:
+    return {'display_artist': '', 'tokens': [], 'primary': ''}
+
+  normalized = unicodedata.normalize('NFKC', display_artist)
+  tokens = [
+    _normalize_artist_token(token)
+    for token in ARTIST_SPLIT_PATTERN.split(normalized)
+    if _normalize_artist_token(token)
+  ]
+  if not tokens:
+    tokens = [_normalize_artist_token(normalized)] if _normalize_artist_token(normalized) else []
+  return {
+    'display_artist': display_artist,
+    'tokens': tokens,
+    'primary': tokens[0] if tokens else ''
+  }
+
+
+def _normalize_artist_token(value: str) -> str:
+  cleaned = unicodedata.normalize('NFKC', str(value or '')).strip()
+  cleaned = re.sub(r'\s+', ' ', cleaned)
+  return cleaned
+
+
+def _clean_token(value: Any) -> str:
+  cleaned = unicodedata.normalize('NFKC', str(value or '')).strip().lower()
+  cleaned = re.sub(r'\s+', ' ', cleaned)
+  return cleaned
+
+
+def _has_compilation_keyword(album: str) -> bool:
+  normalized = _clean_token(album)
+  return any(keyword in normalized for keyword in COMPILATION_KEYWORDS)
+
+
+def _truthy_compilation(value: Any) -> bool:
+  if isinstance(value, bool):
+    return value
+  if isinstance(value, (int, float)):
+    return value != 0
+  return str(value or '').strip().lower() in {'1', 'true', 'yes'}