# MusicWorkshop/backend/app/metadata_normalization.py

from __future__ import annotations

import re
import unicodedata
from collections import Counter
from pathlib import Path
from typing import Any


# Fields that must all be non-blank for can_ingest_metadata() to return True.
INGEST_REQUIRED_FIELDS = ('title', 'artist', 'album_artist')
# Album artist assigned when an album group is classified as a compilation.
VARIOUS_ARTISTS = 'Various Artists'
# Minimum share (0-1) of the group's entries with a primary artist that the
# most frequent primary artist must reach to be adopted as album artist.
MAIN_ARTIST_THRESHOLD = 0.7
# Separators between artist names inside a single artist string: "/", ";",
# ",", "&", "、", a standalone "x", "feat"/"ft" (optional trailing dot, then
# whitespace or end of string) and "featuring". Whitespace around a separator
# is consumed with it; matching is case-insensitive.
ARTIST_SPLIT_PATTERN = re.compile(
  r'\s*(?:/|;|,|&|、|\bx\b|\bfeat\.?(?=\s|$)|\bft\.?(?=\s|$)|\bfeaturing\b)\s*',
  re.IGNORECASE
)
# Keywords looked for in the cleaned album title to recognise compilations and
# soundtracks. Entries must be lowercase: the title is lowercased before it is
# compared against them.
COMPILATION_KEYWORDS = (
  'top',
  'hits',
  'best',
  '精选',  # "selection" / best-of
  'ost',
  'original soundtrack',
  'soundtrack',
  '原声带'  # "original soundtrack"
)


def merge_metadata_layers(
  raw_metadata: dict[str, Any] | None,
  matched_metadata: dict[str, Any] | None,
  metadata_patch: dict[str, Any] | None = None
) -> dict[str, Any]:
  """Overlay the three metadata layers into one new dict.

  Precedence, lowest to highest: ``raw_metadata``, ``matched_metadata``,
  ``metadata_patch``. Matched values that are ``None`` or ``''`` never
  override a lower layer. Patch values only skip ``None``, so a patch can
  blank a field with an empty string. The inputs are not modified.
  """
  layered: dict[str, Any] = {}
  if raw_metadata:
    layered.update(raw_metadata)

  # Matched values win over raw tags unless they carry no information.
  for field, candidate in (matched_metadata or {}).items():
    if candidate not in (None, ''):
      layered[field] = candidate

  # Patch values win over everything; only None means "leave untouched".
  for field, override in (metadata_patch or {}).items():
    if override is not None:
      layered[field] = override

  return layered


def normalize_metadata_shape(metadata: dict[str, Any] | None) -> dict[str, Any]:
  """Return a copy of ``metadata`` with ``None`` placeholders replaced.

  Text fields that are present but ``None`` become ``''``. A present
  ``compilation`` value of ``''`` or ``None`` becomes ``0``. Keys that are
  absent stay absent, and other keys are copied unchanged.
  """
  shaped = dict(metadata or {})
  text_fields = (
    'title',
    'artist',
    'album',
    'album_artist',
    'lyrics',
    'normalization_strategy',
    'album_artist_reason'
  )
  for field in text_fields:
    # The '' default keeps missing keys from being created.
    if shaped.get(field, '') is None:
      shaped[field] = ''

  # The 0 default keeps a missing flag from being created.
  if shaped.get('compilation', 0) in ('', None):
    shaped['compilation'] = 0
  return shaped


def can_ingest_metadata(metadata: dict[str, Any]) -> bool:
  """Return True when every field in ``INGEST_REQUIRED_FIELDS`` is non-blank.

  A field counts as blank when it is missing, falsy, or only whitespace once
  converted to ``str``.
  """
  for field in INGEST_REQUIRED_FIELDS:
    text = str(metadata.get(field) or '')
    if not text.strip():
      return False
  return True


class MetadataNormalizationService:
  """Derive album-level metadata for task items from their album group.

  For one item the service merges its metadata layers, gathers the items of
  the same task that resolve to the same group key (release id, album name,
  parent directory, or the item itself), and derives ``album_artist``,
  ``compilation`` and the explanatory ``normalization_strategy`` /
  ``album_artist_reason`` fields from the primary artists in that group.
  """

  def __init__(self, task_store):
    # The store is only asked for list_all_task_items(task_id, active_only=True).
    self.task_store = task_store

  def create_cache(self) -> dict[str, dict[Any, Any]]:
    """Return a fresh, empty cache that can be passed to ``normalize_item``."""
    return {section: {} for section in ('task_items', 'group_entries', 'group_analysis')}

  def normalize_item(
    self,
    item: dict,
    metadata_patch: dict[str, Any] | None = None,
    cache: dict[str, dict[Any, Any]] | None = None
  ) -> dict[str, Any]:
    """Return the normalized metadata for ``item``.

    Args:
      item: Task item; reads ``original_tags_json``, ``matched_metadata_json``,
        ``task_id``, ``id`` and, for grouping, ``relative_path``/``filename``.
      metadata_patch: Optional overrides applied on top of the item's layers.
      cache: Optional cache from ``create_cache`` shared across calls.
    """
    layered = merge_metadata_layers(
      item.get('original_tags_json'),
      item.get('matched_metadata_json'),
      metadata_patch
    )
    group_cache_key, siblings = self._build_group_entries(item, metadata_patch, cache)
    return self._normalize_merged_metadata(layered, siblings, cache, group_cache_key)

  @staticmethod
  def _make_group_entry(item_id: Any, metadata: dict[str, Any]) -> dict[str, Any]:
    """Bundle an item's merged metadata with its parsed artist string."""
    return {
      'item_id': item_id,
      'metadata': metadata,
      'artist_info': parse_artist_string(metadata.get('artist'))
    }

  def _build_group_entries(
    self,
    item: dict,
    metadata_patch: dict[str, Any] | None,
    cache: dict[str, dict[Any, Any]] | None
  ) -> tuple[Any, list[dict[str, Any]]]:
    """Collect the task items that share ``item``'s group key.

    Returns ``(cache_key, entries)``. ``entries`` falls back to the current
    item alone when no task item resolves to the same group.
    """
    own_metadata = merge_metadata_layers(
      item.get('original_tags_json'),
      item.get('matched_metadata_json'),
      metadata_patch
    )
    candidates = self._get_task_items(item['task_id'], cache)
    own_group = self._group_key(own_metadata, item)
    cache_key = self._group_cache_key(item, own_group, metadata_patch)

    if cache is not None:
      known_groups = cache['group_entries']
      if cache_key in known_groups:
        return cache_key, known_groups[cache_key]

    members: list[dict[str, Any]] = []
    for candidate in candidates:
      # The pending patch applies to the current item only.
      patch = metadata_patch if candidate['id'] == item['id'] else None
      candidate_metadata = merge_metadata_layers(
        candidate.get('original_tags_json'),
        candidate.get('matched_metadata_json'),
        patch
      )
      if self._group_key(candidate_metadata, candidate) == own_group:
        members.append(self._make_group_entry(candidate['id'], candidate_metadata))

    if not members:
      members.append(self._make_group_entry(item['id'], own_metadata))

    if cache is not None:
      cache['group_entries'][cache_key] = members

    return cache_key, members

  def _get_task_items(
    self,
    task_id: str,
    cache: dict[str, dict[Any, Any]] | None
  ) -> list[dict[str, Any]]:
    """Return the active items of a task, consulting the cache when given."""
    if cache is None:
      return self.task_store.list_all_task_items(task_id, active_only=True)

    per_task = cache['task_items']
    listed = per_task.get(task_id)
    if listed is None:
      listed = self.task_store.list_all_task_items(task_id, active_only=True)
      per_task[task_id] = listed
    return listed

  def _group_cache_key(
    self,
    item: dict,
    current_group_key: tuple[str, str],
    metadata_patch: dict[str, Any] | None
  ) -> tuple[Any, ...]:
    """Build the cache key for a group.

    Without a patch the key is shared by every item of the group. With a
    patch it also carries the item id and the patch contents.
    """
    patch_key = self._metadata_patch_cache_key(metadata_patch)
    shared_key = (item['task_id'], current_group_key)
    if patch_key is None:
      return shared_key
    return shared_key + (item['id'], patch_key)

  def _metadata_patch_cache_key(self, metadata_patch: dict[str, Any] | None) -> tuple[tuple[str, str], ...] | None:
    """Return a hashable, order-independent key for a patch, or None if empty."""
    if not metadata_patch:
      return None
    pairs = [(name, repr(content)) for name, content in metadata_patch.items()]
    pairs.sort()
    return tuple(pairs)

  def _group_key(self, metadata: dict[str, Any], item: dict) -> tuple[str, str]:
    """Return the ``(kind, value)`` key that decides album-group membership.

    Preference order: release id, release group id, album name, parent
    directory of the item's path, and finally the item's own id.
    """
    for id_field in ('release_id', 'release_group_id'):
      token = _clean_token(metadata.get(id_field))
      if token:
        return (id_field, token)

    album_token = _clean_token(metadata.get('album'))
    if album_token:
      return ('album', album_token)

    location = item.get('relative_path') or item.get('filename') or ''
    folder = Path(location).parent.as_posix()
    if folder in ('', '.'):
      # No directory information: the item forms its own group.
      return ('item', str(item['id']))
    return ('path', _clean_token(folder))

  def _normalize_merged_metadata(
    self,
    merged: dict[str, Any],
    group_entries: list[dict[str, Any]],
    cache: dict[str, dict[Any, Any]] | None,
    group_cache_key: Any
  ) -> dict[str, Any]:
    """Fill in album artist, compilation flag and artist display fields.

    An album artist already present in ``merged`` is kept as is. Otherwise
    it is derived from the primary artists of ``group_entries``.
    """
    result = dict(merged)
    track_artist = str(result.get('artist') or '').strip()
    given_album_artist = str(result.get('album_artist') or '').strip()
    album_title = str(result.get('album') or '').strip()
    parsed_artist = parse_artist_string(track_artist)

    if given_album_artist:
      result.update({
        'album_artist': given_album_artist,
        'normalization_strategy': 'source_preserved',
        'album_artist_reason': '保留来源或人工指定的专辑艺术家',
        'artist_tokens': parsed_artist['tokens'],
        'display_artist': parsed_artist['display_artist'],
        'compilation': 1 if _truthy_compilation(result.get('compilation')) else 0
      })
      return normalize_metadata_shape(result)

    analysis = self._analyze_group_entries(group_entries, cache, group_cache_key)
    strategy, album_artist, reason, compilation = self._pick_album_artist(
      analysis,
      _has_compilation_keyword(album_title),
      track_artist,
      parsed_artist
    )
    result.update({
      'album_artist': album_artist,
      'compilation': compilation,
      'normalization_strategy': strategy,
      'album_artist_reason': reason,
      'artist_tokens': parsed_artist['tokens'],
      'display_artist': parsed_artist['display_artist']
    })
    return normalize_metadata_shape(result)

  def _pick_album_artist(
    self,
    analysis: dict[str, Any],
    compilation_keyword_hit: bool,
    track_artist: str,
    parsed_artist: dict[str, Any]
  ) -> tuple[str, str, str, int]:
    """Choose ``(strategy, album_artist, reason, compilation)`` for a group.

    Rules are tried in order and the first match wins.
    """
    dominant_artist = analysis['dominant_artist']
    dominant_ratio = analysis['dominant_ratio']
    distinct_artists = len(analysis['unique_main_artists'])
    has_collaboration_markup = analysis['has_collaboration_markup']

    if distinct_artists == 1 and dominant_artist:
      if has_collaboration_markup:
        return (
          'main_artist_feat',
          dominant_artist,
          '同专辑主艺人一致，但存在 feat/合作曲目，按主艺人专辑处理',
          0
        )
      return ('single_artist', dominant_artist, '同专辑曲目的主艺人一致，按单艺人专辑处理', 0)

    if dominant_artist and dominant_ratio >= MAIN_ARTIST_THRESHOLD:
      return (
        'main_artist_feat',
        dominant_artist,
        f'主艺人 {dominant_artist} 在同专辑中占比达到 {dominant_ratio:.0%}',
        0
      )

    if distinct_artists > 1 and compilation_keyword_hit:
      return ('compilation', VARIOUS_ARTISTS, '多艺人分散且专辑名命中合辑/原声带关键词', 1)

    if dominant_artist:
      return (
        'dominant_artist_fallback',
        dominant_artist,
        f'未命中合辑规则，回退到出现频次最高的主艺人 {dominant_artist}',
        0
      )

    if track_artist:
      return (
        'single_track_fallback',
        parsed_artist['primary'] or track_artist,
        '仅有当前曲目可用，回退到当前曲目艺人',
        0
      )

    return ('unresolved', '', '无法从当前专辑分组推导专辑艺术家', 0)

  def _analyze_group_entries(
    self,
    group_entries: list[dict[str, Any]],
    cache: dict[str, dict[Any, Any]] | None,
    group_cache_key: Any
  ) -> dict[str, Any]:
    """Summarise the primary artists of a group.

    Returns a dict with ``dominant_artist`` (most frequent primary artist or
    None), ``dominant_ratio`` (its share among entries that have a primary
    artist), ``unique_main_artists`` (set of primary artists) and
    ``has_collaboration_markup`` (any entry with more than one artist token).
    The result is stored in and served from ``cache`` when one is given.
    """
    if cache is not None:
      known_analysis = cache['group_analysis']
      if group_cache_key in known_analysis:
        return known_analysis[group_cache_key]

    primaries = []
    for entry in group_entries:
      lead = entry['artist_info']['primary']
      if lead:
        primaries.append(lead)

    collaborative = False
    for entry in group_entries:
      if len(entry['artist_info']['tokens']) > 1:
        collaborative = True
        break

    tally = Counter(primaries)
    if tally:
      dominant_artist, dominant_count = tally.most_common(1)[0]
    else:
      dominant_artist, dominant_count = None, 0

    if primaries:
      dominant_ratio = dominant_count / len(primaries)
    else:
      dominant_ratio = 0.0

    analysis = {
      'dominant_artist': dominant_artist,
      'dominant_ratio': dominant_ratio,
      'unique_main_artists': set(primaries),
      'has_collaboration_markup': collaborative
    }
    if cache is not None:
      cache['group_analysis'][group_cache_key] = analysis
    return analysis


def parse_artist_string(value: Any) -> dict[str, Any]:
  """Split an artist string into individual artist names.

  Returns a dict with:
    ``display_artist``: the input as a stripped string, otherwise unchanged.
    ``tokens``: the non-empty names obtained by splitting the NFKC-normalized
      string on ``ARTIST_SPLIT_PATTERN``; if splitting leaves nothing, the
      whole normalized string is used as the only token.
    ``primary``: the first token, or ``''`` when there are none.
  """
  display_artist = str(value or '').strip()
  if not display_artist:
    return {'display_artist': '', 'tokens': [], 'primary': ''}

  normalized = unicodedata.normalize('NFKC', display_artist)

  tokens = []
  for piece in ARTIST_SPLIT_PATTERN.split(normalized):
    name = _normalize_artist_token(piece)
    if name:
      tokens.append(name)

  if not tokens:
    # The string consisted only of separators: keep it as a single name.
    whole = _normalize_artist_token(normalized)
    if whole:
      tokens.append(whole)

  primary = tokens[0] if tokens else ''
  return {
    'display_artist': display_artist,
    'tokens': tokens,
    'primary': primary
  }


def _normalize_artist_token(value: str) -> str:
  cleaned = unicodedata.normalize('NFKC', str(value or '')).strip()
  cleaned = re.sub(r'\s+', ' ', cleaned)
  return cleaned


def _clean_token(value: Any) -> str:
  cleaned = unicodedata.normalize('NFKC', str(value or '')).strip().lower()
  cleaned = re.sub(r'\s+', ' ', cleaned)
  return cleaned


def _has_compilation_keyword(album: str) -> bool:
  """Return whether an album title contains a compilation/soundtrack keyword.

  The title is cleaned with ``_clean_token`` and checked against each entry
  of ``COMPILATION_KEYWORDS``.

  ASCII keywords only count when they are not embedded in a longer run of
  Latin letters. Plain substring matching would treat "Ghost Stories" as
  containing "ost" and "Non-Stop" as containing "top". With the letter
  boundary, titles such as "Top 100", "top100" or "电影ost" still match.
  Non-ASCII keywords keep substring matching, because such titles are
  typically written without separators between words.
  """
  normalized = _clean_token(album)
  if not normalized:
    return False

  for keyword in COMPILATION_KEYWORDS:
    if not keyword.isascii():
      if keyword in normalized:
        return True
      continue
    # The title is lowercased by _clean_token, so [a-z] covers every ASCII
    # letter that could extend the keyword into a different word.
    bounded = r'(?<![a-z])' + re.escape(keyword) + r'(?![a-z])'
    if re.search(bounded, normalized):
      return True
  return False


def _truthy_compilation(value: Any) -> bool:
  if isinstance(value, bool):
    return value
  if isinstance(value, (int, float)):
    return value != 0
  return str(value or '').strip().lower() in {'1', 'true', 'yes'}