diff --git a/.gitignore b/.gitignore index 94966f7..b4d88f9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,11 @@ frontend/node_modules/ frontend/dist/ .DS_Store +.idea/ +.pytest_cache/ +.dev-runtime/ +backend/data/ +backend/.venv/ +__pycache__/ +services/metadata/src/ +services/metadata/.env.local diff --git a/backend/app/__init__.py b/backend/app/__init__.py new file mode 100644 index 0000000..29eceb5 --- /dev/null +++ b/backend/app/__init__.py @@ -0,0 +1,4 @@ +from .scanner import probe_local_assets + + +__all__ = ['probe_local_assets'] diff --git a/backend/app/defaults.py b/backend/app/defaults.py new file mode 100644 index 0000000..0104b85 --- /dev/null +++ b/backend/app/defaults.py @@ -0,0 +1,66 @@ +def create_default_config(): + return { + 'input': '/volume1/downloads/music', + 'output': '/volume1/docker/navidrome/music', + 'trash': '/volume1/docker/navidrome/trash', + 'schedule': { + 'enabled': True, + 'type': 'daily', + 'dayOfWeek': '1', + 'time': '02:00', + 'cron': '0 2 * * *' + }, + 'advancedStrategy': { + 'metadataFallback': True, + 'downloadAssets': True, + 'replaceLowQualityDuplicates': False + }, + 'notifications': { + 'dingtalkWebhook': '', + 'dingtalkSecret': '', + 'telegramBotToken': '', + 'telegramChatId': '', + 'emailSmtp': '', + 'emailUser': '', + 'emailPass': '', + 'emailTo': '' + }, + 'metadata': { + 'acoustidUrl': 'https://api.acoustid.org/v2', + 'acoustidClientKey': '', + 'musicbrainz': 'https://musicbrainz.org/ws/2/', + 'netease': 'http://localhost:3000', + 'qq': 'http://localhost:3300', + 'spotifyUrl': 'https://api.spotify.com/v1', + 'spotifyClientId': '', + 'spotifySecret': '', + 'discogsUrl': 'https://api.discogs.com', + 'discogsToken': '', + 'lastfmUrl': 'https://ws.audioscrobbler.com/2.0/', + 'lastfmKey': '', + 'geniusUrl': 'https://api.genius.com', + 'geniusToken': '' + } + } + + +def merge_config(partial_config): + defaults = create_default_config() + merged = defaults | { + key: value for key, value in partial_config.items() if 
key in defaults + } + + for nested_key in ('schedule', 'advancedStrategy', 'notifications', 'metadata'): + nested_value = partial_config.get(nested_key, {}) + if isinstance(nested_value, dict): + merged[nested_key] = defaults[nested_key] | nested_value + + return merged + + +def derive_task_state(config): + return ( + 'ready' + if config['input'].strip() and config['output'].strip() and config['trash'].strip() + else 'unconfigured' + ) diff --git a/backend/app/exception_service.py b/backend/app/exception_service.py new file mode 100644 index 0000000..14c3fe6 --- /dev/null +++ b/backend/app/exception_service.py @@ -0,0 +1,397 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from .metadata_normalization import ( + MetadataNormalizationService, + can_ingest_metadata, + merge_metadata_layers, + normalize_metadata_shape +) +from .task_constants import current_timestamp + + +EXCEPTION_TYPE_LABELS = { + 'missing_tags': '元数据缺失', + 'duplicates': '文件重复', + 'match_failed': '匹配失败', + 'low_score': '匹配分过低', + 'convert_failed': '转码失败', + 'organize_failed': '入库失败' +} + +EXCEPTION_TYPES = tuple(EXCEPTION_TYPE_LABELS.keys()) +READ_ONLY_ACTIONS: list[str] = [] +ACTION_RULES = { + 'missing_tags': ['retry_match', 'edit_metadata', 'save_and_organize', 'ignore_exception', 'delete_file'], + 'match_failed': ['retry_match', 'edit_metadata', 'save_and_organize', 'ignore_exception', 'delete_file'], + 'low_score': [ + 'retry_match', + 'select_match_candidate', + 'edit_metadata', + 'save_and_organize', + 'ignore_exception', + 'delete_file' + ], + 'duplicates': ['keep_existing', 'replace_existing', 'keep_both_with_rename', 'ignore_exception', 'delete_file'], + 'convert_failed': ['retry_preprocess', 'move_to_review_trash', 'ignore_exception', 'delete_file'], + 'organize_failed': ['retry_organize', 'edit_target_path', 'move_to_review_trash', 'ignore_exception', 'delete_file'] +} + + +class ExceptionItemNotFoundError(Exception): + pass + + +class 
ExceptionService: + def __init__(self, task_store): + self.task_store = task_store + self.metadata_normalizer = MetadataNormalizationService(task_store) + + def get_summary(self) -> dict: + counts_by_type = {exception_type: 0 for exception_type in EXCEPTION_TYPES} + total = 0 + + for source_item in self.task_store.list_exception_source_items('all'): + exception_index = self._identify_exception_item(source_item) + if exception_index is None: + continue + counts_by_type[exception_index['exception_type']] += 1 + total += 1 + + return { + 'total': total, + 'counts_by_type': counts_by_type, + 'scanned_at': current_timestamp() + } + + def get_items( + self, + exception_type: str = 'all', + page: int = 1, + page_size: int = 50, + resolution_status: str = 'open' + ) -> dict: + if exception_type != 'all' and exception_type not in EXCEPTION_TYPES: + raise ValueError(f'Unsupported exception type: {exception_type}') + + indexed_items = [] + for source_item in self.task_store.list_exception_source_items(resolution_status): + exception_index = self._identify_exception_item(source_item) + if exception_index is None: + continue + if exception_type != 'all' and exception_index['exception_type'] != exception_type: + continue + indexed_items.append(exception_index) + + total = len(indexed_items) + offset = (page - 1) * page_size + page_indexes = indexed_items[offset:offset + page_size] + normalization_cache = self.metadata_normalizer.create_cache() + return { + 'items': [ + self._build_exception_item( + page_index['source_item'], + include_detail=False, + exception_index=page_index, + normalization_cache=normalization_cache + ) + for page_index in page_indexes + ], + 'page': page, + 'page_size': page_size, + 'total': total + } + + def get_item(self, exception_id: int) -> dict: + source_item = self.task_store.get_exception_source_item(exception_id) + if source_item is None: + raise ExceptionItemNotFoundError(exception_id) + + exception_item = self._build_exception_item(source_item, 
include_detail=True) + if exception_item is None: + raise ExceptionItemNotFoundError(exception_id) + return exception_item + + def resolve_audio_path(self, exception_id: int) -> Path: + item = self.get_item(exception_id) + candidates = [ + item.get('current_file_path'), + item.get('trash_file_path') + ] + + for candidate in candidates: + if not candidate: + continue + path = Path(candidate) + if path.exists() and path.is_file(): + return path + + raise FileNotFoundError(f'No playable audio found for exception item: {exception_id}') + + def _identify_exception_item(self, source_item: dict) -> dict | None: + resolution = source_item.get('exception_resolution_json') or {} + exception_state = self._resolve_exception_state(source_item) + if exception_state is None: + exception_state = self._resolve_from_resolution_snapshot(source_item) + if exception_state is None: + return None + + exception_type, exception_stage, exception_reason_code, exception_message = exception_state + return { + 'source_item': source_item, + 'resolution': resolution, + 'exception_type': exception_type, + 'exception_stage': exception_stage, + 'exception_reason_code': exception_reason_code, + 'exception_message': exception_message, + 'display_reason': exception_message or self._default_reason(exception_type) + } + + def _build_exception_item( + self, + source_item: dict, + *, + include_detail: bool, + exception_index: dict | None = None, + normalization_cache: dict[str, dict[Any, Any]] | None = None + ) -> dict | None: + exception_index = exception_index or self._identify_exception_item(source_item) + if exception_index is None: + return None + + resolution = exception_index['resolution'] + raw_metadata = self._normalize_metadata(source_item.get('original_tags_json')) + matched_metadata = self._normalize_metadata(source_item.get('matched_metadata_json')) + metadata_draft = self._normalize_metadata(resolution.get('metadata_draft')) + effective_metadata = self._build_effective_metadata( + 
source_item, + raw_metadata, + matched_metadata, + metadata_draft, + normalization_cache + ) + workflow_state = self._resolve_workflow_state(source_item, effective_metadata) + can_ingest = self._can_ingest(effective_metadata) + pending_ingest = ( + source_item.get('exception_resolution_status') == 'open' + and workflow_state in {'candidate_selected', 'ready_to_ingest'} + ) + display_title = self._first_non_empty(effective_metadata.get('title'), source_item.get('filename')) or '-' + + payload = { + 'exception_id': source_item['id'], + 'task_id': source_item['task_id'], + 'task_started_at': source_item['task_started_at'], + 'exception_type': exception_index['exception_type'], + 'exception_stage': exception_index['exception_stage'], + 'exception_reason_code': exception_index['exception_reason_code'], + 'exception_message': exception_index['exception_message'], + 'captured_at': source_item['updated_at'], + 'filename': source_item['filename'], + 'relative_path': source_item['relative_path'], + 'original_path': source_item['original_path'], + 'current_file_path': source_item['current_file_path'], + 'trash_file_path': source_item.get('trash_file_path'), + 'audio_props_json': source_item.get('audio_props_json'), + 'original_tags_json': source_item.get('original_tags_json'), + 'matched_metadata_json': source_item.get('matched_metadata_json'), + 'duplicate_of_path': source_item.get('duplicate_of_path'), + 'dedupe_decision_json': source_item.get('dedupe_decision_json'), + 'library_relative_path': source_item.get('library_relative_path'), + 'library_file_path': source_item.get('library_file_path'), + 'match_source': source_item.get('match_source'), + 'match_confidence': source_item.get('match_confidence'), + 'preview_available': False, + 'available_actions': self._available_actions_for(exception_index['exception_type'], source_item, can_ingest), + 'exception_resolution_status': source_item.get('exception_resolution_status') or 'open', + 'exception_resolution_json': resolution, 
+ 'workflow_state': workflow_state, + 'raw_metadata': raw_metadata, + 'metadata_draft': metadata_draft, + 'effective_metadata': effective_metadata, + 'normalization_strategy': effective_metadata.get('normalization_strategy'), + 'album_artist_reason': effective_metadata.get('album_artist_reason'), + 'compilation': int(effective_metadata.get('compilation') or 0), + 'can_ingest': can_ingest, + 'pending_ingest': pending_ingest, + 'display_title': display_title, + 'display_reason': exception_index['display_reason'], + 'type_label': EXCEPTION_TYPE_LABELS[exception_index['exception_type']] + } + + if include_detail: + payload.update( + { + 'preprocess_artifacts_json': source_item.get('preprocess_artifacts_json'), + 'match_candidates_json': source_item.get('match_candidates_json'), + 'match_enrichment_json': source_item.get('match_enrichment_json'), + 'organize_decision_json': source_item.get('organize_decision_json') + } + ) + + return payload + + def _resolve_from_resolution_snapshot( + self, + item: dict + ) -> tuple[str, str, str | None, str | None] | None: + resolution = item.get('exception_resolution_json') or {} + snapshot = resolution.get('after_snapshot') or resolution.get('before_snapshot') or {} + exception_type = snapshot.get('exception_type') + if exception_type not in EXCEPTION_TYPES: + return None + return ( + exception_type, + snapshot.get('exception_stage') or 'organize', + snapshot.get('exception_reason_code'), + snapshot.get('exception_message') + ) + + def _resolve_exception_state( + self, + item: dict + ) -> tuple[str, str, str | None, str | None] | None: + if item.get('organize_status') in {'trashed', 'failed'}: + return ( + 'organize_failed', + 'organize', + self._first_non_empty(item.get('organize_reason'), item.get('organize_status')), + self._first_non_empty(item.get('organize_message'), self._default_reason('organize_failed')) + ) + + if item.get('dedupe_status') == 'duplicate_trashed': + return ( + 'duplicates', + 'dedupe', + 
self._first_non_empty(item.get('dedupe_reason'), item.get('dedupe_status')), + self._first_non_empty(item.get('dedupe_message'), self._default_reason('duplicates')) + ) + + if item.get('dedupe_status') == 'failed': + return ( + 'duplicates', + 'dedupe', + self._first_non_empty(item.get('dedupe_reason'), item.get('dedupe_status')), + self._first_non_empty(item.get('dedupe_message'), self._default_reason('duplicates')) + ) + + if item.get('match_status') == 'low_score': + return ( + 'low_score', + 'match', + self._first_non_empty(item.get('match_reason'), item.get('match_status')), + self._first_non_empty(item.get('match_message'), self._default_reason('low_score')) + ) + + if item.get('match_status') in {'failed', 'not_found'}: + return ( + 'match_failed', + 'match', + self._first_non_empty(item.get('match_reason'), item.get('match_status')), + self._first_non_empty(item.get('match_message'), self._default_reason('match_failed')) + ) + + if ( + item.get('preprocess_status') == 'failed' + and item.get('preprocess_reason') == 'convert_failed' + ): + return ( + 'convert_failed', + 'preprocess', + item.get('preprocess_reason'), + self._first_non_empty(item.get('preprocess_message'), self._default_reason('convert_failed')) + ) + + preprocess_reason = item.get('preprocess_reason') or '' + if item.get('preprocess_status') == 'warning' and 'metadata_failed' in preprocess_reason: + return ( + 'missing_tags', + 'preprocess', + preprocess_reason, + self._first_non_empty(item.get('preprocess_message'), self._default_reason('missing_tags')) + ) + + return None + + def _default_reason(self, exception_type: str) -> str: + defaults = { + 'missing_tags': '无法提取有效元数据', + 'duplicates': '检测到重复文件', + 'match_failed': '未能完成元数据匹配', + 'low_score': '匹配候选分数过低', + 'convert_failed': '音频转码失败', + 'organize_failed': '整理入库失败' + } + return defaults[exception_type] + + def _available_actions_for(self, exception_type: str, item: dict, can_ingest: bool) -> list[str]: + resolution_status = 
item.get('exception_resolution_status') or 'open' + if resolution_status in {'resolved', 'ignored'}: + return READ_ONLY_ACTIONS.copy() + + actions = ACTION_RULES.get(exception_type, READ_ONLY_ACTIONS).copy() + + if exception_type == 'low_score' and not (item.get('match_candidates_json') or []): + actions = [action for action in actions if action != 'select_match_candidate'] + + if exception_type == 'organize_failed' and item.get('matched_metadata_json'): + return actions + + if exception_type == 'organize_failed': + return [action for action in actions if action != 'retry_organize'] + + return actions + + def _resolve_workflow_state(self, item: dict, effective_metadata: dict[str, Any]) -> str: + resolution_status = item.get('exception_resolution_status') or 'open' + resolution = item.get('exception_resolution_json') or {} + if resolution_status == 'ignored': + return 'ignored' + if resolution_status == 'resolved': + return 'deleted' if resolution.get('action') == 'delete_file' else 'ingested' + + workflow_state = resolution.get('workflow_state') + if workflow_state in {'open', 'candidate_selected', 'ready_to_ingest'}: + if workflow_state == 'candidate_selected' and self._can_ingest(effective_metadata): + return 'ready_to_ingest' + return workflow_state + + return 'ready_to_ingest' if self._can_ingest(effective_metadata) else 'open' + + def _normalize_metadata(self, metadata: dict[str, Any] | None) -> dict[str, Any]: + return normalize_metadata_shape(metadata) + + def _build_effective_metadata( + self, + source_item: dict[str, Any], + raw_metadata: dict[str, Any], + matched_metadata: dict[str, Any], + metadata_draft: dict[str, Any], + normalization_cache: dict[str, dict[Any, Any]] | None = None + ) -> dict[str, Any]: + metadata_patch = {key: value for key, value in metadata_draft.items() if value is not None} + merged = merge_metadata_layers(raw_metadata, matched_metadata, metadata_patch) + return self.metadata_normalizer.normalize_item( + { + **source_item, + 
'original_tags_json': raw_metadata, + 'matched_metadata_json': matched_metadata + }, + metadata_patch, + cache=normalization_cache + ) if merged else {} + + def _can_ingest(self, metadata: dict[str, Any]) -> bool: + return can_ingest_metadata(metadata) + + def _first_non_empty(self, *values: Any) -> Any: + for value in values: + if isinstance(value, str): + if value.strip(): + return value + continue + if value is not None: + return value + return None diff --git a/backend/app/library_index.py b/backend/app/library_index.py new file mode 100644 index 0000000..00a09b3 --- /dev/null +++ b/backend/app/library_index.py @@ -0,0 +1,330 @@ +import hashlib +import importlib +import re +import unicodedata +from datetime import datetime, timezone +from pathlib import Path + +from .scanner import ALLOWED_AUDIO_EXTENSIONS + + +LOSSLESS_EXTENSIONS = {'.flac', '.wav', '.ape', '.aiff', '.alac'} +PRESERVED_VERSION_TOKENS = { + 'live', + 'remix', + 'demo', + 'karaoke', + 'instrumental', + 'cover' +} + + +def build_library_index( + output_dir: str, + *, + probe_audio, + read_tags +) -> dict: + items = scan_library_items( + output_dir, + probe_audio=probe_audio, + read_tags=read_tags + ) + by_basis = { + 'recording_id': {}, + 'release_track': {}, + 'text_duration': {} + } + + for item in items: + for basis, identity_key in identity_keys_for_item(item, include_fingerprint=False): + by_basis.setdefault(basis, {}).setdefault(identity_key, []).append(item) + + return { + 'count': len(items), + 'items': items, + 'by_basis': by_basis + } + + +def scan_library_items( + output_dir: str, + *, + probe_audio, + read_tags +) -> list[dict]: + output_root = Path(output_dir).expanduser().resolve(strict=False) + items: list[dict] = [] + + if not output_root.exists() or not output_root.is_dir(): + return items + + for file_path in sorted(output_root.rglob('*')): + if not file_path.is_file() or file_path.suffix.lower() not in ALLOWED_AUDIO_EXTENSIONS: + continue + + stat = file_path.stat() + 
modified_at = format_timestamp(stat.st_mtime) + relative_path = file_path.relative_to(output_root).as_posix() + absolute_path = str(file_path.resolve(strict=False)) + audio_props = safe_probe_audio(probe_audio, absolute_path) + tags = safe_read_tags(read_tags, absolute_path) + + items.append( + { + 'track_id': build_track_id(relative_path, stat.st_size, modified_at), + 'id': None, + 'current_file_path': absolute_path, + 'file_path': absolute_path, + 'library_file_path': absolute_path, + 'relative_path': relative_path, + 'library_relative_path': relative_path, + 'filename': file_path.name, + 'extension': file_path.suffix.lower(), + 'size_bytes': stat.st_size, + 'modified_at': modified_at, + 'audio_props_json': audio_props, + 'match_confidence': None, + 'match_is_authoritative': True, + 'matched_metadata_json': build_library_metadata(tags, audio_props, file_path), + 'acoustic_fingerprint': None, + 'fingerprint_duration_seconds': audio_props.get('duration_seconds'), + 'created_at': None + } + ) + + return items + + +def count_suspected_duplicates(items: list[dict]) -> int: + duplicate_count = 0 + + for group in group_items_by_identity(items): + if len(group) > 1: + duplicate_count += len(group) - 1 + + return duplicate_count + + +def group_items_by_identity(items: list[dict]) -> list[list[dict]]: + groups: list[list[dict]] = [] + indexed_groups: dict[tuple[str, str], list[dict]] = {} + + for item in items: + identity_keys = identity_keys_for_item(item, include_fingerprint=False) + if not identity_keys: + continue + + target_group = None + for identity_key in identity_keys: + target_group = indexed_groups.get(identity_key) + if target_group is not None: + break + + if target_group is None: + target_group = [item] + groups.append(target_group) + else: + target_group.append(item) + + for identity_key in identity_keys: + indexed_groups.setdefault(identity_key, target_group) + + return groups + + +def build_track_id(relative_path: str, size_bytes: int | None, modified_at: 
str | None) -> str: + digest = hashlib.sha1( + f'{relative_path}|{size_bytes or 0}|{modified_at or ""}'.encode('utf-8') + ).hexdigest() + return digest + + +def safe_probe_audio(probe_audio, file_path: str) -> dict: + try: + return probe_audio(file_path) or {} + except Exception: + return {} + + +def default_read_library_tags(file_path: str) -> dict: + mutagen = importlib.import_module('mutagen') + tags_file = mutagen.File(file_path, easy=False) + if tags_file is None or not getattr(tags_file, 'tags', None): + return {} + + normalized = {} + for key, value in tags_file.tags.items(): + normalized[normalize_tag_key(key)] = coerce_tag_value(value) + return normalized + + +def safe_read_tags(read_tags, file_path: str) -> dict: + try: + return read_tags(file_path) or {} + except Exception: + return {} + + +def build_library_metadata(tags: dict, audio_props: dict, file_path: Path) -> dict: + release_date = first_non_empty(tags.get('date'), tags.get('year')) + return { + 'title': first_non_empty(tags.get('title'), file_path.stem), + 'artist': first_non_empty(tags.get('artist'), tags.get('albumartist')), + 'artists': split_artists(tags.get('artist')), + 'album': tags.get('album'), + 'album_artist': first_non_empty(tags.get('albumartist'), tags.get('artist')), + 'track_number': parse_track_number(tags.get('tracknumber')), + 'disc_number': parse_track_number(tags.get('discnumber')), + 'release_date': release_date, + 'year': extract_year(release_date), + 'duration_seconds': audio_props.get('duration_seconds'), + 'recording_id': first_non_empty( + tags.get('musicbrainzrecordingid'), + tags.get('musicbrainztrackid') + ), + 'release_id': first_non_empty( + tags.get('musicbrainzalbumid'), + tags.get('musicbrainzreleaseid') + ), + 'release_group_id': tags.get('musicbrainzreleasegroupid'), + 'source_ids': { + key: value + for key, value in { + 'musicbrainz_recording_id': first_non_empty( + tags.get('musicbrainzrecordingid'), + tags.get('musicbrainztrackid') + ), + 
'musicbrainz_release_id': first_non_empty( + tags.get('musicbrainzalbumid'), + tags.get('musicbrainzreleaseid') + ), + 'musicbrainz_release_group_id': tags.get('musicbrainzreleasegroupid') + }.items() + if value + } + } + + +def identity_keys_for_item(item: dict, *, include_fingerprint: bool) -> list[tuple[str, str]]: + metadata = item.get('matched_metadata_json') or {} + keys: list[tuple[str, str]] = [] + + if metadata.get('recording_id'): + keys.append(('recording_id', str(metadata['recording_id']))) + + release_id = metadata.get('release_id') + disc_number = parse_track_number(metadata.get('disc_number')) + track_number = parse_track_number(metadata.get('track_number')) + if release_id and disc_number and track_number: + keys.append(('release_track', f'{release_id}|{disc_number}|{track_number}')) + + title_key = normalize_identity_text(metadata.get('title')) + artist_key = normalize_identity_text( + first_non_empty(metadata.get('artist'), metadata.get('album_artist')) + ) + duration_seconds = first_non_empty( + metadata.get('duration_seconds'), + item.get('fingerprint_duration_seconds'), + (item.get('audio_props_json') or {}).get('duration_seconds') + ) + if title_key and artist_key and duration_seconds not in (None, ''): + title_versions = extract_preserved_version_tokens(metadata.get('title')) + keys.append(( + 'text_duration', + f'{title_key}|{artist_key}|{duration_bucket(duration_seconds)}|{",".join(sorted(title_versions))}' + )) + + if include_fingerprint and not keys: + fingerprint = item.get('acoustic_fingerprint') + if fingerprint and duration_seconds not in (None, ''): + keys.append(('fingerprint_duration', f'{fingerprint}|{duration_bucket(duration_seconds)}')) + + return keys + + +def choose_primary_identity(item: dict) -> tuple[str | None, str | None]: + keys = identity_keys_for_item(item, include_fingerprint=False) + if keys: + return keys[0] + fallback_keys = identity_keys_for_item(item, include_fingerprint=True) + return fallback_keys[0] if 
fallback_keys else (None, None) + + +def format_timestamp(timestamp: float) -> str: + return ( + datetime.fromtimestamp(timestamp, tz=timezone.utc) + .replace(microsecond=0) + .isoformat() + .replace('+00:00', 'Z') + ) + + +def normalize_identity_text(value: str | None) -> str: + if not value: + return '' + normalized = unicodedata.normalize('NFKC', str(value)).lower() + normalized = re.sub(r'\b(feat|ft|featuring)\.?\b', ' ', normalized) + normalized = re.sub(r'\bversion\b', ' ', normalized) + normalized = re.sub(r'[^a-z0-9]+', ' ', normalized) + return ' '.join(normalized.split()) + + +def extract_preserved_version_tokens(value: str | None) -> set[str]: + normalized = normalize_identity_text(value) + return {token for token in normalized.split() if token in PRESERVED_VERSION_TOKENS} + + +def normalize_tag_key(value: str) -> str: + return re.sub(r'[^a-z0-9]+', '', value.lower()) + + +def coerce_tag_value(value) -> str | None: + if isinstance(value, list): + if not value: + return None + return str(value[0]) + if isinstance(value, bytes): + return value.decode('utf-8', errors='ignore') + return str(value) if value not in (None, '') else None + + +def parse_track_number(value) -> int | None: + if value in (None, ''): + return None + match = re.search(r'\d+', str(value)) + return int(match.group(0)) if match else None + + +def extract_year(value) -> int | None: + if value in (None, ''): + return None + match = re.search(r'(\d{4})', str(value)) + return int(match.group(1)) if match else None + + +def duration_bucket(value) -> int | None: + duration = safe_float(value) + return int(round(duration)) if duration is not None else None + + +def safe_float(value) -> float | None: + if value in (None, ''): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def first_non_empty(*values): + for value in values: + if value not in (None, ''): + return value + return None + + +def split_artists(value: str | None) -> list[str]: + if not 
value: + return [] + return [part.strip() for part in re.split(r'[,/&;]+', str(value)) if part.strip()] diff --git a/backend/app/library_postprocess.py b/backend/app/library_postprocess.py new file mode 100644 index 0000000..453aec6 --- /dev/null +++ b/backend/app/library_postprocess.py @@ -0,0 +1,1062 @@ +import os +import re +import shutil +import time +import unicodedata +from pathlib import Path + +from . import library_index +from .scanner import ALLOWED_AUDIO_EXTENSIONS +from .task_constants import ( + DEDUPE_PROGRESS_BATCH_SIZE, + DEDUPE_PROGRESS_INTERVAL_SECONDS, + ORGANIZE_PROGRESS_BATCH_SIZE, + ORGANIZE_PROGRESS_INTERVAL_SECONDS, + TASK_STATUS_RUNNING +) + + +LOSSLESS_EXTENSIONS = library_index.LOSSLESS_EXTENSIONS +PRESERVED_VERSION_TOKENS = library_index.PRESERVED_VERSION_TOKENS +REPLACE_SCORE_THRESHOLD = 15.0 +MAX_PATH_COMPONENT_LENGTH = 96 + + +class DedupeItemError(Exception): + def __init__(self, reason: str, message: str): + super().__init__(message) + self.reason = reason + self.message = message + + +class OrganizeItemError(Exception): + def __init__(self, reason: str, message: str): + super().__init__(message) + self.reason = reason + self.message = message + + +class DedupeRunner: + def __init__(self, task_store, preprocessor, task_stream): + self.task_store = task_store + self.preprocessor = preprocessor + self.task_stream = task_stream + + def run(self, task_id: str, current_stats: dict, config_snapshot: dict): + dedupe_stats = current_stats['dedupe'].copy() + candidates = self.task_store.list_dedupe_candidate_items(task_id) + dedupe_stats['input_items'] = len(candidates) + current_stats['dedupe'] = dedupe_stats.copy() + self._persist_progress(task_id, current_stats, dedupe_stats) + + if not candidates: + return + + library_index = self._index_library_files(config_snapshot['output']) + dedupe_stats['library_candidates'] = library_index['count'] + current_stats['dedupe'] = dedupe_stats.copy() + self.task_stream.broadcast_event( + task_id, + 
'dedupe.library_indexed', + 'dedupe', + {'count': library_index['count']} + ) + self._append_log( + task_id, + level='info', + event_type='dedupe.library_indexed', + message=f'已索引输出库音频: {library_index["count"]} 个', + payload={'count': library_index['count']} + ) + + groups = _group_batch_candidates(candidates) + winners: list[dict] = [] + processed_count = 0 + last_progress_at = time.monotonic() + + for group in groups: + running_items = [ + self.task_store.update_task_item( + item['id'], + dedupe_status='running', + dedupe_reason=None, + dedupe_message=None, + dedupe_group_key=None, + duplicate_of_path=None, + duplicate_of_item_id=None, + dedupe_decision_json=None + ) + for item in group + ] + winner, batch_duplicates, identity_basis, group_key = _select_batch_winner(running_items) + + for duplicate_item in batch_duplicates: + try: + self.task_stream.broadcast_event( + task_id, + 'dedupe.lookup_started', + 'dedupe', + {'item': duplicate_item} + ) + self._append_log( + task_id, + level='info', + event_type='dedupe.lookup_started', + message=f'开始比对重复项: {duplicate_item["relative_path"]}', + payload={'item': duplicate_item} + ) + dedupe_stats['batch_duplicates'] += 1 + trashed_path = self._move_file_to_trash( + config_snapshot['trash'], + 'duplicates', + task_id, + duplicate_item['id'], + duplicate_item['current_file_path'] + ) + final_item = self.task_store.update_task_item( + duplicate_item['id'], + is_active=0, + current_file_path=trashed_path, + trash_file_path=trashed_path, + dedupe_status='duplicate_trashed', + dedupe_reason='batch_duplicate', + dedupe_message='当前批次中存在更优文件,已移入回收站', + dedupe_group_key=group_key, + duplicate_of_path=winner['current_file_path'], + duplicate_of_item_id=winner['id'], + dedupe_decision_json={ + 'comparison_scope': 'batch', + 'identity_basis': identity_basis, + 'quality_breakdown': { + 'kept': _build_quality_breakdown(winner), + 'trashed': _build_quality_breakdown(duplicate_item) + }, + 'kept_side': 'batch', + 'trashed_path': 
trashed_path, + 'replaced_existing_path': None, + 'compared_candidates': [ + _serialize_compared_candidate('kept', winner), + _serialize_compared_candidate('trashed', duplicate_item) + ] + } + ) + self.task_stream.broadcast_event( + task_id, + 'dedupe.item_duplicate', + 'dedupe', + {'item': final_item} + ) + self._append_log( + task_id, + level='warning', + event_type='dedupe.item_duplicate', + message=f'批次重复已淘汰: {duplicate_item["relative_path"]}', + payload={'item': final_item} + ) + except DedupeItemError as error: + dedupe_stats['failed_items'] += 1 + final_item = self.task_store.update_task_item( + duplicate_item['id'], + dedupe_status='failed', + dedupe_reason=error.reason, + dedupe_message=error.message, + dedupe_group_key=group_key, + duplicate_of_path=winner['current_file_path'], + duplicate_of_item_id=winner['id'] + ) + self.task_stream.broadcast_event( + task_id, + 'dedupe.item_failed', + 'dedupe', + {'item': final_item} + ) + self._append_log( + task_id, + level='error', + event_type='dedupe.item_failed', + message=f'重复检测失败: {duplicate_item["relative_path"]}', + payload={'item': final_item} + ) + processed_count += 1 + last_progress_at = self._maybe_persist_progress( + task_id, + current_stats, + dedupe_stats, + processed_count, + last_progress_at + ) + + winners.append( + self.task_store.update_task_item( + winner['id'], + dedupe_status='running', + dedupe_group_key=group_key + ) + ) + + replace_enabled = bool( + (config_snapshot.get('advancedStrategy') or {}).get('replaceLowQualityDuplicates') + ) + + for winner in winners: + if not winner['is_active']: + continue + + self.task_stream.broadcast_event( + task_id, + 'dedupe.lookup_started', + 'dedupe', + {'item': winner} + ) + self._append_log( + task_id, + level='info', + event_type='dedupe.lookup_started', + message=f'开始比对重复项: {winner["relative_path"]}', + payload={'item': winner} + ) + identity_basis, identity_key = _choose_primary_identity(winner) + if not identity_basis or not identity_key: + 
unique_item = self.task_store.update_task_item( + winner['id'], + dedupe_status='unique', + dedupe_reason=None, + dedupe_message='未发现重复项', + dedupe_group_key=winner.get('dedupe_group_key') or f'item:{winner["id"]}', + dedupe_decision_json={ + 'comparison_scope': 'none', + 'identity_basis': None, + 'quality_breakdown': {'kept': _build_quality_breakdown(winner)}, + 'kept_side': 'batch', + 'trashed_path': None, + 'replaced_existing_path': None, + 'compared_candidates': [_serialize_compared_candidate('kept', winner)] + } + ) + dedupe_stats['kept_items'] += 1 + self.task_stream.broadcast_event( + task_id, + 'dedupe.item_unique', + 'dedupe', + {'item': unique_item} + ) + self._append_log( + task_id, + level='success', + event_type='dedupe.item_unique', + message=f'未发现重复项,保留文件: {winner["relative_path"]}', + payload={'item': unique_item} + ) + processed_count += 1 + last_progress_at = self._maybe_persist_progress( + task_id, + current_stats, + dedupe_stats, + processed_count, + last_progress_at + ) + continue + + library_candidates = library_index['by_basis'].get(identity_basis, {}).get(identity_key, []) + if not library_candidates: + unique_item = self.task_store.update_task_item( + winner['id'], + dedupe_status='unique', + dedupe_reason=None, + dedupe_message='未发现库内重复项', + dedupe_group_key=winner.get('dedupe_group_key') or identity_key, + dedupe_decision_json={ + 'comparison_scope': 'library', + 'identity_basis': identity_basis, + 'quality_breakdown': {'kept': _build_quality_breakdown(winner)}, + 'kept_side': 'batch', + 'trashed_path': None, + 'replaced_existing_path': None, + 'compared_candidates': [_serialize_compared_candidate('kept', winner)] + } + ) + dedupe_stats['kept_items'] += 1 + self.task_stream.broadcast_event( + task_id, + 'dedupe.item_unique', + 'dedupe', + {'item': unique_item} + ) + self._append_log( + task_id, + level='success', + event_type='dedupe.item_unique', + message=f'未发现库内重复项,保留文件: {winner["relative_path"]}', + payload={'item': unique_item} + ) + 
processed_count += 1 + last_progress_at = self._maybe_persist_progress( + task_id, + current_stats, + dedupe_stats, + processed_count, + last_progress_at + ) + continue + + library_item = max(library_candidates, key=lambda candidate: _build_quality_breakdown(candidate)['total']) + dedupe_stats['library_duplicates'] += 1 + winner_quality = _build_quality_breakdown(winner) + library_quality = _build_quality_breakdown(library_item) + + try: + if replace_enabled and winner_quality['total'] >= library_quality['total'] + REPLACE_SCORE_THRESHOLD: + replaced_path = self._move_file_to_trash( + config_snapshot['trash'], + 'duplicates', + task_id, + winner['id'], + library_item['file_path'] + ) + final_item = self.task_store.update_task_item( + winner['id'], + dedupe_status='duplicate_replaced', + dedupe_reason='replaced_library_duplicate', + dedupe_message='当前文件质量明显更高,已替换库内旧文件', + dedupe_group_key=winner.get('dedupe_group_key') or identity_key, + duplicate_of_path=library_item['file_path'], + duplicate_of_item_id=None, + dedupe_decision_json={ + 'comparison_scope': 'library', + 'identity_basis': identity_basis, + 'quality_breakdown': { + 'kept': winner_quality, + 'replaced': library_quality + }, + 'kept_side': 'batch', + 'trashed_path': replaced_path, + 'replaced_existing_path': library_item['file_path'], + 'compared_candidates': [ + _serialize_compared_candidate('kept', winner), + _serialize_compared_candidate('replaced', library_item) + ] + } + ) + dedupe_stats['replaced_library_items'] += 1 + dedupe_stats['kept_items'] += 1 + self.task_stream.broadcast_event( + task_id, + 'dedupe.item_replaced', + 'dedupe', + {'item': final_item} + ) + self._append_log( + task_id, + level='success', + event_type='dedupe.item_replaced', + message=f'已替换库内旧文件: {winner["relative_path"]}', + payload={'item': final_item} + ) + else: + trashed_path = self._move_file_to_trash( + config_snapshot['trash'], + 'duplicates', + task_id, + winner['id'], + winner['current_file_path'] + ) + final_item = 
self.task_store.update_task_item( + winner['id'], + is_active=0, + current_file_path=trashed_path, + trash_file_path=trashed_path, + dedupe_status='duplicate_trashed', + dedupe_reason='library_duplicate', + dedupe_message='输出库中已存在重复文件,保留库内文件', + dedupe_group_key=winner.get('dedupe_group_key') or identity_key, + duplicate_of_path=library_item['file_path'], + duplicate_of_item_id=None, + dedupe_decision_json={ + 'comparison_scope': 'library', + 'identity_basis': identity_basis, + 'quality_breakdown': { + 'kept': library_quality, + 'trashed': winner_quality + }, + 'kept_side': 'library', + 'trashed_path': trashed_path, + 'replaced_existing_path': None, + 'compared_candidates': [ + _serialize_compared_candidate('kept', library_item), + _serialize_compared_candidate('trashed', winner) + ] + } + ) + self.task_stream.broadcast_event( + task_id, + 'dedupe.item_duplicate', + 'dedupe', + {'item': final_item} + ) + self._append_log( + task_id, + level='warning', + event_type='dedupe.item_duplicate', + message=f'输出库已存在重复文件,已淘汰: {winner["relative_path"]}', + payload={'item': final_item} + ) + except DedupeItemError as error: + dedupe_stats['failed_items'] += 1 + final_item = self.task_store.update_task_item( + winner['id'], + dedupe_status='failed', + dedupe_reason=error.reason, + dedupe_message=error.message, + dedupe_group_key=winner.get('dedupe_group_key') or identity_key, + duplicate_of_path=library_item['file_path'] + ) + self.task_stream.broadcast_event( + task_id, + 'dedupe.item_failed', + 'dedupe', + {'item': final_item} + ) + self._append_log( + task_id, + level='error', + event_type='dedupe.item_failed', + message=f'重复检测失败: {winner["relative_path"]}', + payload={'item': final_item} + ) + + processed_count += 1 + last_progress_at = self._maybe_persist_progress( + task_id, + current_stats, + dedupe_stats, + processed_count, + last_progress_at + ) + + self._persist_progress(task_id, current_stats, dedupe_stats) + + def _index_library_files(self, output_dir: str) -> dict: 
+ return library_index.build_library_index( + output_dir, + probe_audio=self._safe_probe_audio, + read_tags=self._safe_read_library_tags + ) + + def _safe_probe_audio(self, file_path: str) -> dict: + try: + return self.preprocessor.probe_audio(file_path) + except Exception: + return {} + + def _safe_read_library_tags(self, file_path: str) -> dict: + return library_index.safe_read_tags(library_index.default_read_library_tags, file_path) + + def _move_file_to_trash( + self, + trash_root: str, + reason: str, + task_id: str, + item_id: int | None, + source_path: str + ) -> str: + source = Path(source_path) + if not source.exists(): + raise DedupeItemError('source_missing', f'源文件不存在: {source}') + destination = _build_unique_destination( + Path(trash_root) / reason / task_id, + _build_prefixed_name(item_id, source.name) + ) + destination.parent.mkdir(parents=True, exist_ok=True) + try: + self._move_file(source, destination) + except OSError as error: + raise DedupeItemError('trash_move_failed', f'移动到回收站失败: {error}') from error + return str(destination.resolve(strict=False)) + + def _move_file(self, source: Path, destination: Path): + shutil.move(str(source), str(destination)) + + def _persist_progress(self, task_id: str, current_stats: dict, dedupe_stats: dict[str, int]): + current_stats['dedupe'] = dedupe_stats.copy() + self.task_store.update_task( + task_id, + status=TASK_STATUS_RUNNING, + current_stage='dedupe', + stats=current_stats + ) + self.task_stream.broadcast_event( + task_id, + 'dedupe.progress', + 'dedupe', + {'stats': current_stats} + ) + + def _append_log( + self, + task_id: str, + *, + level: str, + event_type: str, + message: str, + payload: dict | None = None + ): + persisted_log = self.task_store.append_log( + task_id, + 'dedupe', + level, + event_type, + message, + payload + ) + self.task_stream.broadcast_event( + task_id, + 'log.appended', + 'dedupe', + {'log': persisted_log} + ) + + def _maybe_persist_progress( + self, + task_id: str, + 
current_stats: dict, + dedupe_stats: dict[str, int], + processed_count: int, + last_progress_at: float + ) -> float: + now = time.monotonic() + if ( + processed_count % DEDUPE_PROGRESS_BATCH_SIZE == 0 + or now - last_progress_at >= DEDUPE_PROGRESS_INTERVAL_SECONDS + ): + self._persist_progress(task_id, current_stats, dedupe_stats) + return now + return last_progress_at + + +class OrganizeRunner: + def __init__(self, task_store, task_stream): + self.task_store = task_store + self.task_stream = task_stream + + def run(self, task_id: str, current_stats: dict, config_snapshot: dict): + organize_stats = current_stats['organize'].copy() + candidates = self.task_store.list_organize_candidate_items(task_id) + organize_stats['input_items'] = len(candidates) + current_stats['organize'] = organize_stats.copy() + self._persist_progress(task_id, current_stats, organize_stats) + + if not candidates: + return + + output_root = Path(config_snapshot['output']).expanduser().resolve(strict=False) + trash_root = Path(config_snapshot['trash']).expanduser().resolve(strict=False) + processed_count = 0 + last_progress_at = time.monotonic() + + for original_item in candidates: + item = self.task_store.update_task_item( + original_item['id'], + organize_status='running', + organize_reason=None, + organize_message=None, + library_relative_path=None, + library_file_path=None, + organize_decision_json=None + ) + + try: + plan = _build_organize_plan(output_root, item) + self.task_stream.broadcast_event( + task_id, + 'organize.path_planned', + 'organize', + { + 'item': item, + 'planned_relative_path': plan['planned_relative_path'] + } + ) + self._append_log( + task_id, + level='info', + event_type='organize.path_planned', + message=f'已规划入库路径: {item["relative_path"]}', + payload={ + 'item': item, + 'planned_relative_path': plan['planned_relative_path'] + } + ) + final_path, collision_count = self._resolve_destination( + output_root / plan['planned_relative_path'], + 
Path(item['current_file_path']) + ) + final_path.parent.mkdir(parents=True, exist_ok=True) + source_path = Path(item['current_file_path']) + if not source_path.exists(): + raise OrganizeItemError('source_missing', f'源文件不存在: {source_path}') + self._move_file(source_path, final_path) + final_relative_path = final_path.relative_to(output_root).as_posix() + renamed = Path(item['current_file_path']).name != final_path.name + moved = Path(item['current_file_path']).resolve(strict=False) != final_path.resolve(strict=False) + + if moved: + organize_stats['moved_items'] += 1 + if renamed: + organize_stats['renamed_items'] += 1 + if collision_count > 1: + organize_stats['collision_resolved'] += 1 + + final_item = self.task_store.update_task_item( + item['id'], + current_file_path=str(final_path.resolve(strict=False)), + filename=final_path.name, + organize_status='organized', + organize_reason=None, + organize_message='已按标准路径入库', + library_relative_path=final_relative_path, + library_file_path=str(final_path.resolve(strict=False)), + organize_decision_json={ + 'source_path': item['current_file_path'], + 'planned_relative_path': plan['planned_relative_path'], + 'final_relative_path': final_relative_path, + 'collision_strategy': 'suffix' if collision_count > 1 else 'none', + 'trashed_on_failure': None, + 'final_action': 'organized' + } + ) + self.task_stream.broadcast_event( + task_id, + 'organize.item_organized', + 'organize', + {'item': final_item} + ) + self._append_log( + task_id, + level='success', + event_type='organize.item_organized', + message=f'文件已入库: {final_relative_path}', + payload={'item': final_item} + ) + except OrganizeItemError as error: + organize_stats['failed_items'] += 1 + final_item = self._handle_failure(task_id, item, error, output_root, trash_root) + if final_item['organize_status'] == 'trashed': + organize_stats['trashed_items'] += 1 + self.task_stream.broadcast_event( + task_id, + 'organize.item_trashed', + 'organize', + {'item': final_item} + ) + 
self._append_log( + task_id, + level='warning', + event_type='organize.item_trashed', + message=f'入库失败后已移入回收站: {item["relative_path"]}', + payload={'item': final_item} + ) + else: + self.task_stream.broadcast_event( + task_id, + 'organize.item_failed', + 'organize', + {'item': final_item} + ) + self._append_log( + task_id, + level='error', + event_type='organize.item_failed', + message=f'整理入库失败: {item["relative_path"]}', + payload={'item': final_item} + ) + except OSError as error: + organize_stats['failed_items'] += 1 + final_item = self._handle_failure( + task_id, + item, + OrganizeItemError('move_failed', f'整理入库失败: {error}'), + output_root, + trash_root + ) + if final_item['organize_status'] == 'trashed': + organize_stats['trashed_items'] += 1 + self.task_stream.broadcast_event( + task_id, + 'organize.item_trashed', + 'organize', + {'item': final_item} + ) + self._append_log( + task_id, + level='warning', + event_type='organize.item_trashed', + message=f'入库失败后已移入回收站: {item["relative_path"]}', + payload={'item': final_item} + ) + else: + self.task_stream.broadcast_event( + task_id, + 'organize.item_failed', + 'organize', + {'item': final_item} + ) + self._append_log( + task_id, + level='error', + event_type='organize.item_failed', + message=f'整理入库失败: {item["relative_path"]}', + payload={'item': final_item} + ) + + processed_count += 1 + last_progress_at = self._maybe_persist_progress( + task_id, + current_stats, + organize_stats, + processed_count, + last_progress_at + ) + + self._persist_progress(task_id, current_stats, organize_stats) + + def _handle_failure( + self, + task_id: str, + item: dict, + error: OrganizeItemError, + output_root: Path, + trash_root: Path + ) -> dict: + source_path = Path(item['current_file_path']) + trashed_path = None + final_status = 'failed' + message = error.message + + if source_path.exists(): + destination = _build_unique_destination( + trash_root / 'organize_failed' / task_id, + _build_prefixed_name(item['id'], source_path.name) 
+ ) + destination.parent.mkdir(parents=True, exist_ok=True) + try: + self._move_file(source_path, destination) + trashed_path = str(destination.resolve(strict=False)) + final_status = 'trashed' + except OSError as trash_error: + message = f'{error.message}; 移入回收站失败: {trash_error}' + + return self.task_store.update_task_item( + item['id'], + current_file_path=trashed_path or item['current_file_path'], + trash_file_path=trashed_path, + organize_status=final_status, + organize_reason=error.reason, + organize_message=message, + organize_decision_json={ + 'source_path': item['current_file_path'], + 'planned_relative_path': None, + 'final_relative_path': None, + 'collision_strategy': 'none', + 'trashed_on_failure': trashed_path, + 'final_action': final_status + } + ) + + def _resolve_destination(self, desired_path: Path, source_path: Path) -> tuple[Path, int]: + candidate = desired_path + collision_index = 1 + + while candidate.exists(): + if candidate.resolve(strict=False) == source_path.resolve(strict=False): + return candidate, collision_index + collision_index += 1 + candidate = candidate.with_name( + f'{desired_path.stem} ({collision_index}){desired_path.suffix}' + ) + + return candidate, collision_index + + def _move_file(self, source: Path, destination: Path): + shutil.move(str(source), str(destination)) + + def _persist_progress(self, task_id: str, current_stats: dict, organize_stats: dict[str, int]): + current_stats['organize'] = organize_stats.copy() + self.task_store.update_task( + task_id, + status=TASK_STATUS_RUNNING, + current_stage='organize', + stats=current_stats + ) + self.task_stream.broadcast_event( + task_id, + 'organize.progress', + 'organize', + {'stats': current_stats} + ) + + def _append_log( + self, + task_id: str, + *, + level: str, + event_type: str, + message: str, + payload: dict | None = None + ): + persisted_log = self.task_store.append_log( + task_id, + 'organize', + level, + event_type, + message, + payload + ) + 
self.task_stream.broadcast_event( + task_id, + 'log.appended', + 'organize', + {'log': persisted_log} + ) + + def _maybe_persist_progress( + self, + task_id: str, + current_stats: dict, + organize_stats: dict[str, int], + processed_count: int, + last_progress_at: float + ) -> float: + now = time.monotonic() + if ( + processed_count % ORGANIZE_PROGRESS_BATCH_SIZE == 0 + or now - last_progress_at >= ORGANIZE_PROGRESS_INTERVAL_SECONDS + ): + self._persist_progress(task_id, current_stats, organize_stats) + return now + return last_progress_at + + +def _group_batch_candidates(items: list[dict]) -> list[list[dict]]: + groups: list[list[dict]] = [] + indexed_groups: dict[tuple[str, str], list[dict]] = {} + + for item in items: + identity_keys = _identity_keys_for_item(item, include_fingerprint=True) + target_group = None + for identity_basis, identity_key in identity_keys: + target_group = indexed_groups.get((identity_basis, identity_key)) + if target_group is not None: + break + if target_group is None: + target_group = [item] + groups.append(target_group) + if identity_keys: + for identity_basis, identity_key in identity_keys: + indexed_groups.setdefault((identity_basis, identity_key), target_group) + else: + indexed_groups[(f'item:{item["id"]}', str(item['id']))] = target_group + else: + target_group.append(item) + + return groups + + +def _select_batch_winner(group: list[dict]) -> tuple[dict, list[dict], str | None, str]: + winner = group[0] + for candidate in group[1:]: + if _compare_batch_candidates(candidate, winner) < 0: + winner = candidate + identity_basis, identity_key = _choose_primary_identity(winner) + return winner, [item for item in group if item['id'] != winner['id']], identity_basis, identity_key or f'item:{winner["id"]}' + + +def _compare_batch_candidates(left: dict, right: dict) -> int: + left_quality = _build_quality_breakdown(left) + right_quality = _build_quality_breakdown(right) + if left_quality['total'] != right_quality['total']: + return -1 if 
left_quality['total'] > right_quality['total'] else 1 + + left_authority = (1 if left.get('match_is_authoritative') else 0, left.get('match_confidence') or 0) + right_authority = (1 if right.get('match_is_authoritative') else 0, right.get('match_confidence') or 0) + if left_authority != right_authority: + return -1 if left_authority > right_authority else 1 + + left_created = left.get('created_at') or '' + right_created = right.get('created_at') or '' + if left_created != right_created: + return -1 if left_created < right_created else 1 + if left['id'] != right['id']: + return -1 if left['id'] < right['id'] else 1 + return 0 + + +def _build_library_metadata(tags: dict, audio_props: dict, file_path: Path) -> dict: + return library_index.build_library_metadata(tags, audio_props, file_path) + + +def _identity_keys_for_item(item: dict, *, include_fingerprint: bool) -> list[tuple[str, str]]: + return library_index.identity_keys_for_item(item, include_fingerprint=include_fingerprint) + + +def _choose_primary_identity(item: dict) -> tuple[str | None, str | None]: + return library_index.choose_primary_identity(item) + + +def _build_quality_breakdown(item: dict) -> dict: + audio_props = item.get('audio_props_json') or {} + duration_seconds = _first_non_empty( + (item.get('matched_metadata_json') or {}).get('duration_seconds'), + item.get('fingerprint_duration_seconds'), + audio_props.get('duration_seconds') + ) + bit_depth = _safe_float(audio_props.get('bit_depth')) + sample_rate = _safe_float(audio_props.get('sample_rate')) + bitrate = _safe_float(audio_props.get('bitrate')) + channels = _safe_float(audio_props.get('channels')) + size_bytes = _safe_float(item.get('size_bytes')) + extension = (item.get('extension') or Path(item.get('current_file_path') or '').suffix).lower() + is_lossless = extension in LOSSLESS_EXTENSIONS or str(audio_props.get('codec') or '').upper() == 'FLAC' + + breakdown = { + 'bit_depth': round(min((bit_depth or 0) / 24.0, 1.0) * 30.0, 2), + 
'sample_rate': round(min((sample_rate or 0) / 96000.0, 1.0) * 20.0, 2), + 'bitrate': round(min((bitrate or 0) / 320000.0, 1.0) * 20.0, 2), + 'lossless': 15.0 if is_lossless else 0.0, + 'channels': round(min((channels or 0) / 2.0, 1.0) * 5.0, 2), + 'size_duration_consistency': 0.0, + 'match_quality': round( + (3.0 if item.get('match_is_authoritative') else 0.0) + + min((_safe_float(item.get('match_confidence')) or 0.0) / 100.0, 1.0) * 2.0, + 2 + ) + } + if size_bytes and duration_seconds and _safe_float(duration_seconds) and size_bytes / _safe_float(duration_seconds) > 1000: + breakdown['size_duration_consistency'] = 5.0 + breakdown['total'] = round(sum(value for key, value in breakdown.items() if key != 'total'), 2) + return breakdown + + +def _serialize_compared_candidate(side: str, item: dict) -> dict: + return { + 'side': side, + 'item_id': item.get('id'), + 'path': item.get('file_path') or item.get('current_file_path'), + 'relative_path': item.get('relative_path'), + 'quality_score': _build_quality_breakdown(item)['total'] + } + + +def _build_organize_plan(output_root: Path, item: dict) -> dict: + metadata = item.get('matched_metadata_json') or {} + album_artist = _sanitize_path_component( + _first_non_empty(metadata.get('album_artist'), metadata.get('artist'), 'Unknown Artist') + ) + if not album_artist: + raise OrganizeItemError('invalid_target_path', '无法生成有效的 Album Artist 目录') + + title = _sanitize_path_component( + _first_non_empty(metadata.get('title'), Path(item.get('current_file_path') or item['filename']).stem) + ) + if not title: + raise OrganizeItemError('invalid_target_path', '无法生成有效的标题文件名') + + year = _extract_year(_first_non_empty(metadata.get('release_date'), metadata.get('year'))) + track_number = _parse_track_number(metadata.get('track_number')) or 1 + disc_number = _parse_track_number(metadata.get('disc_number')) + extension = (item.get('extension') or Path(item.get('current_file_path') or '').suffix).lower() + bucket = 
_bucket_letter(album_artist) + album = _sanitize_path_component(metadata.get('album')) + + filename = f'{track_number:02d} - {title}{extension}' + if album and album.lower() not in {'single', 'singles'}: + path_parts = [bucket, album_artist, album] + if disc_number and disc_number > 1: + path_parts.append(f'Disc {disc_number}') + path_parts.append(filename) + else: + year_label = str(year) if year else 'Unknown Year' + single_dir = _sanitize_path_component(f'{year_label} - {title}') + path_parts = [bucket, album_artist, 'Singles', single_dir, f'01 - {title}{extension}'] + + planned_relative_path = Path(*path_parts).as_posix() + return { + 'output_root': output_root, + 'planned_relative_path': planned_relative_path + } + + +def _build_prefixed_name(item_id: int | None, filename: str) -> str: + safe_name = _sanitize_path_component(Path(filename).name, fallback='file') + return f'{item_id}_{safe_name}' if item_id is not None else safe_name + + +def _build_unique_destination(directory: Path, filename: str) -> Path: + candidate = directory / filename + if not candidate.exists(): + return candidate + stem = candidate.stem + suffix = candidate.suffix + counter = 2 + while True: + next_candidate = candidate.with_name(f'{stem} ({counter}){suffix}') + if not next_candidate.exists(): + return next_candidate + counter += 1 + + +def _bucket_letter(value: str) -> str: + normalized = unicodedata.normalize('NFKC', value).strip() + if not normalized: + return '#' + first = normalized[0].upper() + return first if first.isalnum() and first.isascii() else '#' + + +def _sanitize_path_component(value: str | None, fallback: str | None = None) -> str: + raw = unicodedata.normalize('NFKC', str(value or fallback or '')).strip() + cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', ' ', raw) + cleaned = re.sub(r'\s+', ' ', cleaned).strip().rstrip('. ') + if not cleaned: + cleaned = fallback or '' + if len(cleaned) > MAX_PATH_COMPONENT_LENGTH: + cleaned = cleaned[:MAX_PATH_COMPONENT_LENGTH].rstrip('. 
') + return cleaned + + +def _normalize_identity_text(value: str | None) -> str: + return library_index.normalize_identity_text(value) + + +def _extract_preserved_version_tokens(value: str | None) -> set[str]: + return library_index.extract_preserved_version_tokens(value) + + +def _normalize_tag_key(value: str) -> str: + return library_index.normalize_tag_key(value) + + +def _coerce_tag_value(value) -> str | None: + return library_index.coerce_tag_value(value) + + +def _parse_track_number(value) -> int | None: + return library_index.parse_track_number(value) + + +def _extract_year(value) -> int | None: + return library_index.extract_year(value) + + +def _duration_bucket(value) -> int | None: + return library_index.duration_bucket(value) + + +def _safe_float(value) -> float | None: + return library_index.safe_float(value) + + +def _first_non_empty(*values): + return library_index.first_non_empty(*values) + + +def _split_artists(value: str | None) -> list[str]: + return library_index.split_artists(value) diff --git a/backend/app/library_service.py b/backend/app/library_service.py new file mode 100644 index 0000000..9e61439 --- /dev/null +++ b/backend/app/library_service.py @@ -0,0 +1,395 @@ +import shutil +from pathlib import Path + +from . 
def get_summary(self, output_dir: str) -> dict:
    """Scan the library under *output_dir* and return headline counts.

    The result holds the number of tracks, the number of distinct non-empty
    album and artist values found in each item's ``matched_metadata_json``,
    the suspected-duplicate count reported by ``library_index``, and the
    timestamp taken just before the scan started.
    """
    scanned_at = current_timestamp()
    library_items = self._scan_items(output_dir)

    artist_names = set()
    album_names = set()
    for entry in library_items:
        tags = entry.get('matched_metadata_json') or {}
        artist_name = tags.get('artist')
        if artist_name:
            artist_names.add(artist_name)
        album_name = tags.get('album')
        if album_name:
            album_names.add(album_name)

    return {
        'total_tracks': len(library_items),
        'total_albums': len(album_names),
        'total_artists': len(artist_names),
        'suspected_duplicates': library_index.count_suspected_duplicates(library_items),
        'scanned_at': scanned_at
    }
def move_track_to_exception(self, config_snapshot: dict, track_id: str) -> dict:
    """Move one library track into the trash area and record it as an unmatched task item.

    The file is moved to ``<trash>/match_failed/<task_id>/`` and a new task
    (trigger source ``manual_library_requeue``) is created, given a single
    item with ``match_status='not_found'``, and immediately marked completed.

    Args:
        config_snapshot: Configuration dict; its ``output`` and ``trash``
            entries are read here and the whole dict is passed to the task store.
        track_id: Identifier compared against ``track_id`` of the scanned library items.

    Returns:
        A dict with ``exception_id`` (the inserted item's ``id``),
        ``library_relative_path``, ``trash_file_path`` and a user-facing ``message``.

    Raises:
        ValueError: If the output or trash directory is not configured, or the
            resolved file is not located under the output directory.
        TaskConflictError: Raised by ``_ensure_no_active_tasks`` when an ingest
            or repair task is active.
        LibraryTrackNotFoundError: If no scanned item has this ``track_id`` or
            the file no longer exists as a regular file.
    """
    output_dir = (config_snapshot.get('output') or '').strip()
    trash_dir = (config_snapshot.get('trash') or '').strip()
    if not output_dir:
        raise ValueError('请先配置输出音乐库目录')
    if not trash_dir:
        raise ValueError('请先配置回收站目录')
    self._ensure_no_active_tasks()

    output_root = Path(output_dir).expanduser().resolve(strict=False)
    trash_root = Path(trash_dir).expanduser().resolve(strict=False)
    library_item = self._find_item_by_track_id(output_dir, track_id)
    if library_item is None:
        raise LibraryTrackNotFoundError(track_id)

    source_path = Path(library_item['library_file_path']).expanduser().resolve(strict=False)
    if not source_path.exists() or not source_path.is_file():
        raise LibraryTrackNotFoundError(track_id)
    # Refuse to touch anything that resolves outside the configured library root.
    if not self._is_relative_to(source_path, output_root):
        raise ValueError('目标文件不在输出音乐库目录内')

    # Capture everything that needs the file at its library location before it is moved.
    audio_props = library_item.get('audio_props_json') or {}
    original_tags = self._read_original_tags(str(source_path))
    fingerprint_payload = self._safe_calculate_fingerprint(str(source_path))
    # Stats describe a one-item run that was scanned and preprocessed but not matched.
    stats = create_empty_task_stats()
    stats['scan']['total_found'] = 1
    stats['scan']['queued'] = 1
    stats['preprocess']['input_items'] = 1
    stats['preprocess']['output_items'] = 1
    stats['preprocess']['metadata_snapshots'] = 1
    if fingerprint_payload.get('fingerprint'):
        stats['preprocess']['fingerprints_ok'] = 1
    else:
        stats['preprocess']['fingerprints_failed'] = 1
    stats['match']['input_items'] = 1
    stats['match']['not_found'] = 1

    task = self.task_store.create_task_if_idle(
        config_snapshot,
        trigger_source='manual_library_requeue'
    )
    task_id = task['task_id']
    try:
        # One sub-directory per task; the helper picks a non-colliding file name.
        destination = _build_unique_destination(
            trash_root / 'match_failed' / task_id,
            source_path.name
        )
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(source_path), str(destination))
        trash_file_path = str(destination.resolve(strict=False))

        # NOTE(review): if any step below raises, the file stays at `destination`;
        # nothing in the except branch moves it back - confirm this is intended.
        item = self.task_store.insert_task_item(
            task_id,
            original_path=str(source_path),
            current_file_path=trash_file_path,
            relative_path=library_item['library_relative_path'],
            filename=destination.name,
            extension=destination.suffix.lower(),
            size_bytes=destination.stat().st_size,
            modified_at=library_item.get('modified_at'),
            local_cover=None,
            local_lyric=None,
            scan_status='queued',
            scan_reason=None,
            scan_message=None,
            preprocess_status='completed',
            preprocess_reason=None,
            preprocess_message='已从音乐库载入,等待重新匹配',
            audio_props_json=audio_props,
            original_tags_json=original_tags,
            acoustic_fingerprint=fingerprint_payload.get('fingerprint'),
            # Prefer the freshly computed duration; fall back to the scanned item's value.
            fingerprint_duration_seconds=(
                fingerprint_payload.get('duration_seconds')
                or library_item.get('fingerprint_duration_seconds')
            ),
            match_status='not_found',
            match_reason='manual_library_requeue',
            match_message=MANUAL_REQUEUE_MESSAGE,
            dedupe_status='pending',
            organize_status='pending',
            library_relative_path=library_item['library_relative_path'],
            library_file_path=str(source_path),
            trash_file_path=trash_file_path,
            organize_decision_json={
                'source_path': str(source_path),
                'trashed_path': trash_file_path,
                'final_action': 'manual_library_requeue'
            }
        )
        self.task_store.update_task(
            task_id,
            status=TASK_STATUS_COMPLETED,
            current_stage='complete',
            stage_states=create_task_completed_stage_states(),
            stats=stats,
            completed_at=current_timestamp()
        )
        self.task_store.append_log(
            task_id,
            'match',
            'warning',
            'library.track_requeued',
            MANUAL_REQUEUE_MESSAGE,
            {
                'track_id': track_id,
                'library_relative_path': library_item['library_relative_path'],
                'trash_file_path': trash_file_path,
                'exception_id': item['id']
            }
        )
    except Exception as error:
        # Mark the task as failed at the match stage, then let the original error propagate.
        self.task_store.update_task(
            task_id,
            status=TASK_STATUS_FAILED,
            current_stage='match',
            stage_states=create_match_failed_stage_states(),
            stats=stats,
            error_message=str(error),
            completed_at=current_timestamp()
        )
        raise

    return {
        'exception_id': item['id'],
        'library_relative_path': library_item['library_relative_path'],
        'trash_file_path': trash_file_path,
        'message': '已移入异常中心,等待重新匹配'
    }
audio_props.get('codec'), + 'bitrate': audio_props.get('bitrate'), + 'sample_rate': audio_props.get('sample_rate'), + 'bit_depth': audio_props.get('bit_depth'), + 'channels': audio_props.get('channels'), + 'size_bytes': item.get('size_bytes'), + 'modified_at': item.get('modified_at'), + 'ingest_provenance': provenance + } + ) + + return payloads + + def _build_provenance_indexes(self) -> tuple[dict[str, dict], dict[str, dict]]: + by_path: dict[str, dict] = {} + by_relative_path: dict[str, dict] = {} + + for row in self.task_store.list_library_provenance_items(): + payload = { + 'task_id': row['task_id'], + 'organized_at': row['organized_at'], + 'match_source': row['match_source'], + 'match_confidence': row['match_confidence'], + 'dedupe_status': row['dedupe_status'] + } + + library_file_path = row.get('library_file_path') + library_relative_path = row.get('library_relative_path') + + if library_file_path and library_file_path not in by_path: + by_path[library_file_path] = payload + if library_relative_path and library_relative_path not in by_relative_path: + by_relative_path[library_relative_path] = payload + + return by_path, by_relative_path + + def _scan_items(self, output_dir: str) -> list[dict]: + return library_index.scan_library_items( + output_dir, + probe_audio=self.preprocessor.probe_audio, + read_tags=self.read_tags + ) + + def _find_item_by_track_id(self, output_dir: str, track_id: str) -> dict | None: + return next( + (item for item in self._scan_items(output_dir) if item['track_id'] == track_id), + None + ) + + def _ensure_no_active_tasks(self): + active_ingest = self.task_store.get_active_task('ingest') + if active_ingest is not None: + raise TaskConflictError(active_ingest['task_id']) + + active_repair = self.task_store.get_active_task('repair') + if active_repair is not None: + raise TaskConflictError(active_repair['task_id']) + + def _read_original_tags(self, file_path: str) -> dict: + try: + tags = self.preprocessor.read_tags(file_path) or {} + 
except Exception: + tags = {} + if tags: + return tags + return library_index.build_library_metadata({}, {}, Path(file_path)) + + def _safe_calculate_fingerprint(self, file_path: str) -> dict: + try: + return self.preprocessor.calculate_fingerprint(file_path) or {} + except (AttributeError, PreprocessItemError, FileNotFoundError, OSError, Exception): + return {} + + def _is_relative_to(self, path: Path, parent: Path) -> bool: + try: + path.relative_to(parent) + return True + except ValueError: + return False + + def _filter_tracks( + self, + tracks: list[dict], + *, + q: str | None, + artist: str | None, + album: str | None, + format: str | None, + has_provenance: bool | None + ) -> list[dict]: + query = (q or '').strip().lower() + artist_filter = (artist or '').strip().lower() + album_filter = (album or '').strip().lower() + format_filter = (format or '').strip().lower() + filtered_tracks: list[dict] = [] + + for track in tracks: + if query and not self._matches_query(track, query): + continue + if artist_filter and (track.get('artist') or '').strip().lower() != artist_filter: + continue + if album_filter and (track.get('album') or '').strip().lower() != album_filter: + continue + if format_filter and (track.get('format') or '').strip().lower() != format_filter: + continue + if has_provenance is True and track.get('ingest_provenance') is None: + continue + if has_provenance is False and track.get('ingest_provenance') is not None: + continue + filtered_tracks.append(track) + + return filtered_tracks + + def _matches_query(self, track: dict, query: str) -> bool: + searchable_fields = ( + track.get('filename'), + track.get('title'), + track.get('artist'), + track.get('album'), + track.get('library_relative_path') + ) + return any(query in str(value).lower() for value in searchable_fields if value) + + def _sort_key(self, track: dict, sort_by: str) -> tuple[int, object]: + if sort_by == 'organized_at': + value = ( + (track.get('ingest_provenance') or 
{}).get('organized_at') + or track.get('modified_at') + or '' + ) + return (0 if value else 1, value) + + value = track.get(sort_by) + if isinstance(value, str): + value = value.lower() + return (0 if value not in (None, '') else 1, value or '') diff --git a/backend/app/main.py b/backend/app/main.py new file mode 100644 index 0000000..b2feda9 --- /dev/null +++ b/backend/app/main.py @@ -0,0 +1,442 @@ +import asyncio +import mimetypes +import os +import threading +from pathlib import Path + +from fastapi import FastAPI, Query, Request, WebSocket, WebSocketDisconnect +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse, StreamingResponse + +from .exception_service import ExceptionItemNotFoundError, ExceptionService +from .matcher import Matcher +from .metadata_status import probe_metadata_services +from .preprocessor import Preprocessor +from .repair_runner import RepairRunner, RepairService +from .library_service import LibraryService, LibraryTrackNotFoundError +from .scanner import Scanner +from .schemas import ( + ConfigPayload, + ConfigSaveResponse, + ExceptionDetailPayload, + ExceptionListResponse, + ExceptionSummaryPayload, + LibraryMoveToExceptionResponse, + LibrarySummaryPayload, + LibraryTracksPageResponse, + MetadataStatusResponse, + RepairExecuteRequest, + RepairPreviewRequest, + RepairPreviewResponse, + RepairTaskCurrentResponse, + RepairTaskRunResponse, + TaskCurrentResponse, + TaskDetailResponse, + TaskHistoryListResponse, + TaskItemsPageResponse, + TaskLogsPageResponse, + TaskRunResponse +) +from .storage import ConfigStore +from .task_runner import TaskRunner +from .task_store import TaskConflictError, TaskNotFoundError, TaskStore +from .task_stream import TaskStreamManager + + +BASE_DIR = Path(__file__).resolve().parent.parent +DEFAULT_DB_PATH = BASE_DIR / 'data' / 'music_workshop.db' +DB_PATH = Path(os.getenv('MUSIC_WORKSHOP_DB_PATH', DEFAULT_DB_PATH)) +store = ConfigStore(DB_PATH) +task_store = 
@app.post('/api/tasks/run', response_model=TaskRunResponse, status_code=202)
def run_task():
    """Create a task from the current config and start it on a daemon thread.

    Responds with 409 and the active task id when the task store reports a
    conflict; otherwise returns the new task's id, status, stage information
    and start time.
    """
    config_snapshot = store.get_config()

    try:
        created = task_store.create_task_if_idle(config_snapshot)
    except TaskConflictError as conflict:
        conflict_body = {
            'detail': 'Task already running',
            'task_id': conflict.active_task_id
        }
        return JSONResponse(status_code=409, content=conflict_body)

    worker = threading.Thread(
        target=task_runner.start_task,
        args=(created['task_id'], config_snapshot),
        daemon=True
    )
    worker.start()

    response_fields = ('task_id', 'status', 'current_stage', 'stage_states', 'started_at')
    return {field: created[field] for field in response_fields}
@app.post('/api/library/tracks/{track_id}/move-to-exception', response_model=LibraryMoveToExceptionResponse)
def move_library_track_to_exception(track_id: str):
    """Move a library track into the exception flow using the current config.

    A task conflict raised by the library service is translated into a 409
    response carrying the active task id; other errors propagate to the
    application's exception handlers.
    """
    current_config = store.get_config()
    try:
        outcome = library_service.move_track_to_exception(current_config, track_id)
    except TaskConflictError as conflict:
        conflict_body = {
            'detail': 'Task already running',
            'task_id': conflict.active_task_id
        }
        return JSONResponse(status_code=409, content=conflict_body)
    return outcome
@app.get('/api/exceptions/items/{exception_id}/audio')
def get_exception_item_audio(exception_id: int, request: Request):
    """Stream the audio file of an exception item, honouring a ``Range`` header.

    Without a ``Range`` header the whole file is streamed. With one, the
    header is parsed by ``_parse_range_header`` and the requested slice is
    returned with status 206 and a matching ``Content-Range``.
    """
    audio_path = exception_service.resolve_audio_path(exception_id)
    total_bytes = audio_path.stat().st_size
    guessed_type = mimetypes.guess_type(audio_path.name)[0]
    media_type = guessed_type or 'application/octet-stream'
    requested_range = request.headers.get('range')

    if requested_range:
        first_byte, last_byte = _parse_range_header(requested_range, total_bytes)
        slice_length = last_byte - first_byte + 1
        partial_headers = {
            'Accept-Ranges': 'bytes',
            'Content-Length': str(slice_length),
            'Content-Range': f'bytes {first_byte}-{last_byte}/{total_bytes}'
        }
        return StreamingResponse(
            _iter_file_range(audio_path, first_byte, last_byte),
            status_code=206,
            media_type=media_type,
            headers=partial_headers
        )

    full_headers = {
        'Accept-Ranges': 'bytes',
        'Content-Length': str(total_bytes)
    }
    return StreamingResponse(
        _iter_file_range(audio_path, 0, total_bytes - 1),
        media_type=media_type,
        headers=full_headers
    )
@app.websocket('/api/tasks/{task_id}/stream')
async def stream_task(task_id: str, websocket: WebSocket):
    """Stream live updates for one task over a WebSocket.

    If the task does not exist the socket is accepted and immediately closed
    with code 4404. Otherwise the socket is registered with ``task_stream``,
    an initial ``task.snapshot`` message is sent, and the handler then blocks
    reading from the client until the connection ends.
    """
    try:
        snapshot = task_store.get_task_snapshot(task_id)
    except TaskNotFoundError:
        await websocket.accept()
        await websocket.close(code=4404)
        return

    await task_stream.connect(task_id, websocket)

    try:
        await websocket.send_json(
            {
                'type': 'task.snapshot',
                'task_id': task_id,
                'stage': snapshot['task']['current_stage'],
                'timestamp': snapshot['task']['updated_at'],
                'data': snapshot
            }
        )
        # Incoming messages are ignored; reading is only how a disconnect is noticed.
        while True:
            await websocket.receive_text()
    except WebSocketDisconnect:
        # Normal client departure - nothing to report.
        pass
    finally:
        # Runs on every exit path, including asyncio.CancelledError, which is a
        # BaseException and was not caught by the former `except Exception`,
        # leaving the socket registered with the stream manager.
        task_stream.disconnect(task_id, websocket)
JSONResponse(status_code=404, content={'detail': f'Task not found: {exc}'}) + + +@app.exception_handler(ExceptionItemNotFoundError) +def exception_item_not_found_error_handler(_, exc: ExceptionItemNotFoundError): + return JSONResponse(status_code=404, content={'detail': f'Exception item not found: {exc}'}) + + +@app.exception_handler(LibraryTrackNotFoundError) +def library_track_not_found_error_handler(_, exc: LibraryTrackNotFoundError): + return JSONResponse(status_code=404, content={'detail': f'Library track not found: {exc}'}) + + +@app.exception_handler(FileNotFoundError) +def file_not_found_error_handler(_, exc: FileNotFoundError): + return JSONResponse(status_code=404, content={'detail': str(exc)}) + + +def _parse_range_header(header_value: str, file_size: int) -> tuple[int, int]: + if not header_value.startswith('bytes='): + raise ValueError('Invalid Range header') + + range_value = header_value[6:].strip() + start_text, _, end_text = range_value.partition('-') + if not start_text and not end_text: + raise ValueError('Invalid Range header') + + if start_text: + start = int(start_text) + end = int(end_text) if end_text else file_size - 1 + else: + suffix_length = int(end_text) + if suffix_length <= 0: + raise ValueError('Invalid Range header') + start = max(file_size - suffix_length, 0) + end = file_size - 1 + + if start < 0 or end < start or start >= file_size: + raise ValueError('Invalid Range header') + + return start, min(end, file_size - 1) + + +def _iter_file_range(file_path: Path, start: int, end: int, chunk_size: int = 64 * 1024): + with file_path.open('rb') as file_handle: + file_handle.seek(start) + remaining = end - start + 1 + while remaining > 0: + chunk = file_handle.read(min(chunk_size, remaining)) + if not chunk: + break + remaining -= len(chunk) + yield chunk diff --git a/backend/app/matcher.py b/backend/app/matcher.py new file mode 100644 index 0000000..ef87735 --- /dev/null +++ b/backend/app/matcher.py @@ -0,0 +1,1483 @@ +import base64 
class MatchHttpClient:
    """Small JSON-over-HTTP client with retries and per-key request throttling."""

    def __init__(self):
        # Monotonic timestamp of the most recent request issued per throttle key.
        self._last_request_at: dict[str, float] = {}

    def request_json(
        self,
        provider: str,
        url: str,
        *,
        params: dict | None = None,
        headers: dict[str, str] | None = None,
        method: str = 'GET',
        data: bytes | None = None,
        timeout: int = HTTP_TIMEOUT_SECONDS,
        retries: int = HTTP_RETRY_COUNT,
        throttle_key: str | None = None,
        throttle_seconds: float = 0.0
    ) -> dict:
        """Perform an HTTP request and return the decoded JSON body.

        Args:
            provider: Provider name used in error messages and on the raised error.
            url: Target URL; ``params`` are appended as a query string.
            params: Optional query parameters.
            headers: Optional request headers (copied, never mutated).
            method: HTTP method.
            data: Optional request body.
            timeout: Per-attempt timeout in seconds.
            retries: Number of additional attempts after the first one. HTTP
                responses with status >= 500, URL errors and timeouts are retried.
            throttle_key: When set together with a positive ``throttle_seconds``,
                attempts sharing this key are spaced at least that far apart.
            throttle_seconds: Minimum spacing between throttled attempts.

        Returns:
            The parsed JSON payload, or ``{}`` for an empty response body.

        Raises:
            MatchProviderError: On HTTP errors, connection failures, timeouts
                or an invalid JSON body once retries are exhausted.
        """
        request_headers = dict(headers or {})
        final_url = _append_query_params(url, params)

        for attempt in range(retries + 1):
            # Throttle every attempt, not only the first: the retry back-off
            # (0.2 s) is shorter than typical provider limits, so an
            # unthrottled retry could itself violate the rate limit.
            if throttle_key and throttle_seconds > 0:
                self._throttle(throttle_key, throttle_seconds)

            try:
                response = self._open(
                    request.Request(
                        final_url,
                        data=data,
                        headers=request_headers,
                        method=method
                    ),
                    timeout
                )
                with response:
                    charset = response.headers.get_content_charset() or 'utf-8'
                    payload = response.read().decode(charset)
                return json.loads(payload) if payload else {}
            except error.HTTPError as exc:
                # HTTPError must be handled before URLError, its base class.
                response_body = exc.read().decode('utf-8', errors='ignore')
                if exc.code >= 500 and attempt < retries:
                    time.sleep(0.2 * (attempt + 1))
                    continue
                raise MatchProviderError(
                    provider,
                    f'{provider} 请求失败 (HTTP {exc.code}) {response_body[:160]}'.strip()
                ) from exc
            except error.URLError as exc:
                if attempt < retries:
                    time.sleep(0.2 * (attempt + 1))
                    continue
                raise MatchProviderError(
                    provider,
                    f'{provider} 无法连接: {getattr(exc, "reason", exc)}'
                ) from exc
            except TimeoutError as exc:
                if attempt < retries:
                    time.sleep(0.2 * (attempt + 1))
                    continue
                raise MatchProviderError(provider, f'{provider} 请求超时') from exc
            except json.JSONDecodeError as exc:
                # A malformed body is not retried.
                raise MatchProviderError(provider, f'{provider} 返回了无效 JSON') from exc

        # Only reachable when the loop body never ran (negative ``retries``).
        raise MatchProviderError(provider, f'{provider} 请求失败')

    def _open(self, req: request.Request, timeout: int):
        """Open *req*; kept as a separate method so the transport can be replaced."""
        return request.urlopen(req, timeout=timeout)

    def _throttle(self, throttle_key: str, throttle_seconds: float):
        """Sleep as needed so calls for *throttle_key* are at least *throttle_seconds* apart."""
        now = time.monotonic()
        previous = self._last_request_at.get(throttle_key)
        if previous is not None:
            sleep_seconds = throttle_seconds - (now - previous)
            if sleep_seconds > 0:
                time.sleep(sleep_seconds)
        self._last_request_at[throttle_key] = time.monotonic()
def lookup_recording_seed(
    self,
    provider_name: str,
    config: dict,
    *,
    recording_id: str,
    release_ids: list[str] | None = None,
    search_confidence: float | None = None,
    fingerprint_confidence: float | None = None,
    extra_source_ids: dict | None = None
) -> list[dict]:
    """Expand a recording id into match candidates.

    The recording is fetched, then up to two releases (caller-supplied ids
    first, followed by the releases listed on the recording) are fetched and
    turned into release-level candidates. If none can be built, a single
    recording-only candidate is returned instead, or an empty list.
    """
    recording = self._get_recording(config, recording_id)

    seed_ids = list(release_ids or [])
    for linked_release in recording.get('releases') or []:
        if linked_release.get('id'):
            seed_ids.append(linked_release.get('id'))
    ordered_release_ids = _unique_non_empty(seed_ids)

    shared_kwargs = {
        'search_confidence': search_confidence,
        'fingerprint_confidence': fingerprint_confidence,
        'extra_source_ids': extra_source_ids
    }

    release_candidates: list[dict] = []
    # Only the first two releases are looked up.
    for release_id in ordered_release_ids[:2]:
        release = self._get_release(config, release_id)
        built = self._build_release_candidate(
            provider_name,
            recording,
            release,
            **shared_kwargs
        )
        if built:
            release_candidates.append(built)

    if release_candidates:
        return _dedupe_candidates(release_candidates)

    recording_only = self._build_recording_candidate(
        provider_name,
        recording,
        **shared_kwargs
    )
    if recording_only:
        return [recording_only]
    return []
cache_key = (base_url, recording_id) + if cache_key not in self._recording_cache: + self._recording_cache[cache_key] = self._request_json( + 'musicbrainz', + f'{base_url}/recording/{recording_id}', + params={ + 'inc': 'artists+releases', + 'fmt': 'json' + } + ) + return self._recording_cache[cache_key] + + def _get_release(self, config: dict, release_id: str) -> dict: + base_url = _normalize_base_url(config.get('musicbrainz')) + cache_key = (base_url, release_id) + if cache_key not in self._release_cache: + self._release_cache[cache_key] = self._request_json( + 'musicbrainz', + f'{base_url}/release/{release_id}', + params={ + 'inc': 'artists+recordings+release-groups', + 'fmt': 'json' + } + ) + return self._release_cache[cache_key] + + def _request_json(self, provider: str, url: str, *, params: dict | None = None) -> dict: + return self.http_client.request_json( + provider, + url, + params=params, + headers={'User-Agent': MUSICBRAINZ_USER_AGENT}, + throttle_key='musicbrainz', + throttle_seconds=MUSICBRAINZ_THROTTLE_SECONDS + ) + + def _build_recording_candidate( + self, + provider_name: str, + recording: dict, + *, + search_confidence: float | None, + fingerprint_confidence: float | None, + extra_source_ids: dict | None + ) -> dict | None: + recording_id = recording.get('id') + if not recording_id: + return None + + title = recording.get('title') + artists = _extract_artist_names(recording.get('artist-credit') or []) + duration_seconds = _milliseconds_to_seconds(recording.get('length')) + release_date = None + + return { + 'provider': provider_name, + 'is_authoritative': True, + 'title': title, + 'artist': _join_artists(artists), + 'artists': artists, + 'album': None, + 'album_artist': _join_artists(artists), + 'track_number': None, + 'disc_number': None, + 'release_date': release_date, + 'year': _extract_year(release_date), + 'duration_seconds': duration_seconds, + 'recording_id': recording_id, + 'release_id': None, + 'release_group_id': None, + 'source_ids': { + 
**(extra_source_ids or {}), + 'musicbrainz_recording_id': recording_id + }, + 'fingerprint_confidence': fingerprint_confidence, + 'search_confidence': search_confidence, + 'release_tracklist': [] + } + + def _build_release_candidate( + self, + provider_name: str, + recording: dict, + release: dict, + *, + search_confidence: float | None, + fingerprint_confidence: float | None, + extra_source_ids: dict | None + ) -> dict | None: + recording_id = recording.get('id') + release_id = release.get('id') + if not recording_id or not release_id: + return None + + track_info = _find_release_track(release, recording_id) + track_title = track_info.get('title') or track_info.get('recording', {}).get('title') + track_artist_credit = ( + track_info.get('recording', {}).get('artist-credit') + or recording.get('artist-credit') + or release.get('artist-credit') + or [] + ) + artists = _extract_artist_names(track_artist_credit) + album_artists = _extract_artist_names(release.get('artist-credit') or []) + release_date = release.get('date') + duration_seconds = _milliseconds_to_seconds( + track_info.get('length') or recording.get('length') + ) + + return { + 'provider': provider_name, + 'is_authoritative': True, + 'title': track_title or recording.get('title'), + 'artist': _join_artists(artists or album_artists), + 'artists': artists or album_artists, + 'album': release.get('title'), + 'album_artist': _join_artists(album_artists or artists), + 'track_number': _parse_track_number(track_info.get('position') or track_info.get('number')), + 'disc_number': _parse_track_number(track_info.get('disc_number')), + 'release_date': release_date, + 'year': _extract_year(release_date), + 'duration_seconds': duration_seconds, + 'recording_id': recording_id, + 'release_id': release_id, + 'release_group_id': (release.get('release-group') or {}).get('id'), + 'source_ids': { + **(extra_source_ids or {}), + 'musicbrainz_recording_id': recording_id, + 'musicbrainz_release_id': release_id, + 
class TextSearchProvider:
    """Base class for providers that are queried with a free-text keyword search.

    Subclasses set the class attributes below and implement
    ``_parse_search_payload``; they may override ``build_params`` when the
    provider expects different query parameters.
    """

    # Name handed to the HTTP client to identify the provider.
    provider_name = ''
    # Key inside config['metadata'] that holds the provider's base URL.
    # Left empty here; a subclass that does not set it resolves to no base
    # URL and ``search`` returns no candidates instead of raising.
    base_url_key = ''
    # Path appended to the base URL to reach the search endpoint.
    search_path = ''
    # Keys inside config['metadata'] that must all be non-blank before the
    # provider is queried.
    credentials: tuple[str, ...] = ()

    def __init__(self, http_client: MatchHttpClient):
        self.http_client = http_client

    def search(self, item_metadata: dict, config: dict) -> list[dict]:
        """Run a text search for *item_metadata* and return parsed candidates.

        Returns an empty list without issuing a request when the base URL or
        the query is empty, or when any required credential is blank.
        """
        query = _build_text_query(item_metadata)
        metadata_config = config.get('metadata') or {}
        base_url = _normalize_base_url(metadata_config.get(self.base_url_key))

        if not base_url or not query:
            return []
        if self.credentials and not all(
            (metadata_config.get(field_name) or '').strip()
            for field_name in self.credentials
        ):
            return []

        return self._parse_search_payload(
            self.http_client.request_json(
                self.provider_name,
                f'{base_url}{self.search_path}',
                params=self.build_params(query)
            )
        )

    def build_params(self, query: str) -> dict:
        """Return the query-string parameters sent with the search request."""
        return {'keywords': query, 'limit': MAX_MATCH_CANDIDATES}

    def _parse_search_payload(self, payload: dict) -> list[dict]:
        """Convert the provider's JSON payload into candidate dicts."""
        raise NotImplementedError
class QQProvider(TextSearchProvider):
    """Text-search provider backed by the QQ music API proxy configured under ``qq``."""

    provider_name = 'qq'
    base_url_key = 'qq'
    search_path = '/search'

    def _parse_search_payload(self, payload: dict) -> list[dict]:
        """Turn a QQ search payload into non-authoritative candidates.

        The song list is read from ``data.song.list`` and, failing that, from
        ``result.list``. At most ``MAX_MATCH_CANDIDATES`` songs are converted.
        """
        song_list = (
            ((payload.get('data') or {}).get('song') or {}).get('list')
            or ((payload.get('result') or {}).get('list') or [])
        )
        candidates = []
        for song in song_list[:MAX_MATCH_CANDIDATES]:
            singers = song.get('singer') or song.get('singers') or []
            artists = [artist.get('name') for artist in singers if artist.get('name')]
            album = song.get('album') or {}
            release_date = song.get('time_public') or album.get('time_public')

            # 'interval' is multiplied by 1000 before the millisecond helper,
            # i.e. it is treated as seconds. Convert it to a number first: an
            # explicit null would raise on multiplication and a string would be
            # repeated 1000 times. A missing interval means the duration is
            # unknown, not zero.
            interval = song.get('interval')
            if interval in (None, ''):
                duration_seconds = None
            else:
                duration_seconds = _milliseconds_to_seconds(_to_float(interval) * 1000)

            candidates.append(
                {
                    'provider': 'qq',
                    'is_authoritative': False,
                    'title': song.get('title') or song.get('name'),
                    'artist': _join_artists(artists),
                    'artists': artists,
                    'album': album.get('title') or album.get('name'),
                    'album_artist': _join_artists(artists),
                    'track_number': None,
                    'disc_number': None,
                    'release_date': release_date,
                    'year': _extract_year(release_date),
                    'duration_seconds': duration_seconds,
                    'recording_id': None,
                    'release_id': None,
                    'release_group_id': None,
                    'source_ids': {
                        'qq_song_mid': song.get('mid') or song.get('songmid'),
                        'qq_album_mid': album.get('mid')
                    },
                    'search_confidence': 0.88
                }
            )
        return candidates
metadata_config = config.get('metadata') or {} + base_url = _normalize_base_url(metadata_config.get(self.base_url_key)) + client_id = (metadata_config.get('spotifyClientId') or '').strip() + client_secret = (metadata_config.get('spotifySecret') or '').strip() + + if not base_url or not query or not client_id or not client_secret: + return [] + + token = self._get_access_token(client_id, client_secret) + payload = self.http_client.request_json( + 'spotify', + f'{base_url}{self.search_path}', + params={ + 'q': query, + 'type': 'track', + 'limit': MAX_MATCH_CANDIDATES + }, + headers={'Authorization': f'Bearer {token}'} + ) + candidates = [] + tracks = ((payload.get('tracks') or {}).get('items')) or [] + for track in tracks[:MAX_MATCH_CANDIDATES]: + artists = [artist.get('name') for artist in track.get('artists') or [] if artist.get('name')] + album = track.get('album') or {} + images = album.get('images') or [] + candidates.append( + { + 'provider': 'spotify', + 'is_authoritative': False, + 'title': track.get('name'), + 'artist': _join_artists(artists), + 'artists': artists, + 'album': album.get('name'), + 'album_artist': _join_artists(artists), + 'track_number': track.get('track_number'), + 'disc_number': track.get('disc_number'), + 'release_date': album.get('release_date'), + 'year': _extract_year(album.get('release_date')), + 'duration_seconds': _milliseconds_to_seconds(track.get('duration_ms')), + 'recording_id': None, + 'release_id': None, + 'release_group_id': None, + 'source_ids': { + 'spotify_track_id': track.get('id'), + 'spotify_album_id': album.get('id') + }, + 'cover_url': images[0].get('url') if images else None, + 'search_confidence': 0.9 + } + ) + return candidates + + def _get_access_token(self, client_id: str, client_secret: str) -> str: + cache_key = f'{client_id}:{client_secret}' + cached_token = self._token_cache.get(cache_key) + if cached_token and cached_token['expires_at'] > time.time(): + return cached_token['access_token'] + + basic_token = 
base64.b64encode(f'{client_id}:{client_secret}'.encode()).decode() + payload = self.http_client.request_json( + 'spotify', + SPOTIFY_TOKEN_URL, + method='POST', + data=b'grant_type=client_credentials', + headers={ + 'Authorization': f'Basic {basic_token}', + 'Content-Type': 'application/x-www-form-urlencoded' + } + ) + access_token = payload.get('access_token') + expires_in = int(payload.get('expires_in') or 3600) + self._token_cache[cache_key] = { + 'access_token': access_token, + 'expires_at': time.time() + max(60, expires_in - 30) + } + return access_token + + +class DiscogsProvider: + def __init__(self, http_client: MatchHttpClient): + self.http_client = http_client + + def enrich(self, metadata: dict, config: dict) -> dict | None: + metadata_config = config.get('metadata') or {} + base_url = _normalize_base_url(metadata_config.get('discogsUrl')) + token = (metadata_config.get('discogsToken') or '').strip() + if not base_url or not token: + return None + + payload = self.http_client.request_json( + 'discogs', + f'{base_url}/database/search', + params={ + 'track': metadata.get('title') or '', + 'artist': metadata.get('artist') or '', + 'release_title': metadata.get('album') or '', + 'per_page': 3, + 'token': token + } + ) + result = (payload.get('results') or [None])[0] + if not result: + return None + + return { + 'provider': 'discogs', + 'cover_url': result.get('cover_image'), + 'genres': result.get('genre') or [], + 'tags': result.get('style') or [], + 'quality': 0.9 if result.get('cover_image') else 0.5, + 'source_id': result.get('id') + } + + +class LastFmProvider: + def __init__(self, http_client: MatchHttpClient): + self.http_client = http_client + + def enrich(self, metadata: dict, config: dict) -> dict | None: + metadata_config = config.get('metadata') or {} + base_url = _normalize_base_url(metadata_config.get('lastfmUrl')) + api_key = (metadata_config.get('lastfmKey') or '').strip() + if not base_url or not api_key: + return None + + payload = 
class GeniusProvider:
    """Enrichment provider that looks a track up through the Genius search endpoint."""

    def __init__(self, http_client: MatchHttpClient):
        self.http_client = http_client

    def enrich(self, metadata: dict, config: dict) -> dict | None:
        """Return a lyrics-page enrichment record for *metadata*, or ``None``.

        ``None`` is returned when the base URL, token or text query is empty,
        or when the search yields no usable first hit.
        """
        settings = config.get('metadata') or {}
        api_root = _normalize_base_url(settings.get('geniusUrl'))
        bearer = (settings.get('geniusToken') or '').strip()
        search_text = _build_text_query(metadata)
        if not (api_root and bearer and search_text):
            return None

        response = self.http_client.request_json(
            'genius',
            f'{api_root}/search',
            params={'q': search_text},
            headers={'Authorization': f'Bearer {bearer}'}
        )

        hits = (response.get('response') or {}).get('hits')
        if not hits:
            return None
        first_hit = hits[0]
        if not first_hit:
            return None

        song = first_hit.get('result') or {}
        page_url = song.get('url')
        return {
            'provider': 'genius',
            'lyrics_url': page_url,
            # A hit without a URL gives nothing to link to.
            'quality': 0.8 if page_url else 0.0,
            'source_id': song.get('id')
        }
+ spotify_provider: SpotifyProvider | None = None, + discogs_provider: DiscogsProvider | None = None, + lastfm_provider: LastFmProvider | None = None, + genius_provider: GeniusProvider | None = None + ): + self.http_client = http_client or MatchHttpClient() + self.musicbrainz_provider = musicbrainz_provider or MusicBrainzProvider(self.http_client) + self.acoustid_provider = acoustid_provider or AcoustIdProvider( + self.http_client, + self.musicbrainz_provider + ) + self.netease_provider = netease_provider or NeteaseProvider(self.http_client) + self.qq_provider = qq_provider or QQProvider(self.http_client) + self.spotify_provider = spotify_provider or SpotifyProvider(self.http_client) + self.discogs_provider = discogs_provider or DiscogsProvider(self.http_client) + self.lastfm_provider = lastfm_provider or LastFmProvider(self.http_client) + self.genius_provider = genius_provider or GeniusProvider(self.http_client) + + def match_item(self, item: dict, album_group: list[dict], config: dict) -> dict: + item_metadata = _build_input_metadata(item) + provider_warnings: list[dict] = [] + candidates: list[dict] = [] + provider_scope = set(config.get('repair_provider_scope') or []) + use_all_providers = not provider_scope + + def provider_enabled(name: str) -> bool: + return use_all_providers or name in provider_scope + + if provider_enabled('acoustid'): + candidates.extend( + self._collect_provider_candidates( + 'acoustid', + self.acoustid_provider.search, + provider_warnings, + item, + config + ) + ) + if provider_enabled('musicbrainz'): + candidates.extend( + self._collect_provider_candidates( + 'musicbrainz', + self.musicbrainz_provider.search_text, + provider_warnings, + item_metadata, + config.get('metadata') or {} + ) + ) + candidates = self._score_candidates(item_metadata, album_group, candidates) + + top_authoritative = candidates[0] if candidates else None + fallback_enabled = bool((config.get('advancedStrategy') or {}).get('metadataFallback', True)) + if 
fallback_enabled and (top_authoritative is None or top_authoritative['score'] < 85): + fallback_candidates = [] + if provider_enabled('netease'): + fallback_candidates.extend( + self._collect_provider_candidates( + 'netease', + self.netease_provider.search, + provider_warnings, + item_metadata, + config + ) + ) + if provider_enabled('qq'): + fallback_candidates.extend( + self._collect_provider_candidates( + 'qq', + self.qq_provider.search, + provider_warnings, + item_metadata, + config + ) + ) + if provider_enabled('spotify'): + fallback_candidates.extend( + self._collect_provider_candidates( + 'spotify', + self.spotify_provider.search, + provider_warnings, + item_metadata, + config + ) + ) + candidates = self._score_candidates( + item_metadata, + album_group, + [*candidates, *fallback_candidates] + ) + + if not candidates: + return { + 'status': 'not_found', + 'reason': 'no_candidates', + 'message': '未找到任何匹配候选', + 'source': None, + 'confidence': None, + 'is_authoritative': False, + 'matched_metadata_json': None, + 'match_candidates_json': [], + 'match_enrichment_json': None, + 'provider_warnings': provider_warnings + } + + top_candidate = candidates[0] + if not top_candidate['is_authoritative'] and provider_enabled('musicbrainz'): + aligned_candidate = self._align_candidate_with_warnings( + top_candidate, + config.get('metadata') or {}, + provider_warnings + ) + if aligned_candidate: + top_candidate = self._score_candidates( + item_metadata, + album_group, + [self._merge_aligned_candidate(top_candidate, aligned_candidate)] + )[0] + candidates = self._score_candidates( + item_metadata, + album_group, + [top_candidate, *candidates[1:]] + ) + + runner_up = candidates[1] if len(candidates) > 1 else None + score_gap = top_candidate['score'] - (runner_up['score'] if runner_up else 0) + candidates_json = [_serialize_candidate(candidate) for candidate in candidates[:MAX_MATCH_CANDIDATES]] + enrichment = self._build_enrichment(top_candidate, config) + + if 
top_candidate['is_authoritative']: + if top_candidate['score'] >= 85 and score_gap >= 8: + return self._build_match_result( + 'matched', + 'authoritative_auto_match', + f'权威候选自动匹配成功,得分 {top_candidate["score"]:.1f}', + top_candidate, + candidates_json, + enrichment, + provider_warnings + ) + else: + if top_candidate['score'] >= 80 and score_gap >= 8: + return self._build_match_result( + 'matched_fallback', + 'fallback_auto_match', + f'Fallback 候选自动匹配成功,得分 {top_candidate["score"]:.1f}', + top_candidate, + candidates_json, + enrichment, + provider_warnings + ) + + reason = 'score_gap_too_small' if score_gap < 8 else 'score_below_threshold' + message = ( + f'候选最高分 {top_candidate["score"]:.1f},与次高分差 {score_gap:.1f},需人工复核' + ) + return self._build_match_result( + 'low_score', + reason, + message, + top_candidate, + candidates_json, + enrichment, + provider_warnings + ) + + def _collect_provider_candidates( + self, + provider_name: str, + search_provider, + provider_warnings: list[dict], + *args, + **kwargs + ) -> list[dict]: + try: + return search_provider(*args, **kwargs) + except MatchProviderError as error: + self._append_provider_warning(provider_name, error, provider_warnings) + return [] + + def _align_candidate_with_warnings( + self, + candidate: dict, + metadata_config: dict, + provider_warnings: list[dict] + ) -> dict | None: + try: + return self.musicbrainz_provider.align_candidate(candidate, metadata_config) + except MatchProviderError as error: + self._append_provider_warning('musicbrainz', error, provider_warnings) + return None + + def _append_provider_warning( + self, + provider_name: str, + error: MatchProviderError, + provider_warnings: list[dict] + ): + provider_warnings.append( + { + 'provider': getattr(error, 'provider', None) or provider_name, + 'message': str(error) + } + ) + + def _score_candidates( + self, + item_metadata: dict, + album_group: list[dict], + candidates: list[dict] + ) -> list[dict]: + scored_candidates = [] + for candidate in 
_dedupe_candidates(candidates): + identity_confidence = min( + 1.0, + max( + 0.0, + candidate.get('fingerprint_confidence') + or candidate.get('search_confidence') + or 0.0 + ) + ) + score_breakdown = { + 'fingerprint': round(30 * identity_confidence, 2), + 'title': round(20 * _text_similarity(item_metadata.get('title'), candidate.get('title')), 2), + 'artist': round(15 * _artist_similarity(item_metadata, candidate), 2), + 'album': round(10 * _text_similarity(item_metadata.get('album'), candidate.get('album')), 2), + 'duration': round(10 * _duration_similarity( + item_metadata.get('duration_seconds'), + candidate.get('duration_seconds') + ), 2), + 'track_disc': round(5 * _track_disc_similarity(item_metadata, candidate), 2), + 'album_context': round(10 * _album_context_similarity(album_group, candidate), 2), + 'version_penalty': round(_version_penalty(item_metadata, candidate), 2) + } + total_score = round( + max( + 0.0, + min( + 100.0, + sum( + value + for key, value in score_breakdown.items() + if key != 'version_penalty' + ) - score_breakdown['version_penalty'] + ) + ), + 2 + ) + scored_candidates.append( + { + **candidate, + 'score': total_score, + 'score_breakdown': score_breakdown + } + ) + + return sorted( + scored_candidates, + key=lambda candidate: ( + candidate['score'], + 1 if candidate.get('is_authoritative') else 0, + _provider_rank(candidate.get('provider')) + ), + reverse=True + ) + + def _merge_aligned_candidate(self, fallback_candidate: dict, aligned_candidate: dict) -> dict: + merged_source_ids = { + **(fallback_candidate.get('source_ids') or {}), + **(aligned_candidate.get('source_ids') or {}) + } + return { + **fallback_candidate, + **aligned_candidate, + 'provider': 'musicbrainz', + 'is_authoritative': True, + 'source_ids': merged_source_ids, + 'cover_url': fallback_candidate.get('cover_url') or aligned_candidate.get('cover_url'), + 'lyrics_url': fallback_candidate.get('lyrics_url') or aligned_candidate.get('lyrics_url') + } + + def 
_build_enrichment(self, candidate: dict, config: dict) -> dict: + if not candidate: + return { + 'cover': {'selected_source': None, 'candidates': []}, + 'lyrics': {'selected_source': None, 'candidates': []}, + 'genres': {'selected_source': None, 'candidates': []}, + 'tags': {'selected_source': None, 'candidates': []} + } + + enrichment_candidates = { + 'cover': [], + 'lyrics': [], + 'genres': [], + 'tags': [] + } + + if candidate.get('cover_url'): + enrichment_candidates['cover'].append( + { + 'provider': candidate['provider'], + 'value': candidate['cover_url'], + 'quality': 0.6, + 'source_id': (candidate.get('source_ids') or {}).get(f'{candidate["provider"]}_album_id') + } + ) + + if candidate.get('lyrics_url'): + enrichment_candidates['lyrics'].append( + { + 'provider': candidate['provider'], + 'value': candidate['lyrics_url'], + 'quality': 0.6, + 'source_id': (candidate.get('source_ids') or {}).get(f'{candidate["provider"]}_song_id') + } + ) + + if (config.get('advancedStrategy') or {}).get('downloadAssets', True): + for provider in ( + self.discogs_provider.enrich(candidate, config), + self.lastfm_provider.enrich(candidate, config), + self.genius_provider.enrich(candidate, config) + ): + if not provider: + continue + if provider.get('cover_url'): + enrichment_candidates['cover'].append( + { + 'provider': provider['provider'], + 'value': provider['cover_url'], + 'quality': provider.get('quality', 0.5), + 'source_id': provider.get('source_id') + } + ) + if provider.get('lyrics_url'): + enrichment_candidates['lyrics'].append( + { + 'provider': provider['provider'], + 'value': provider['lyrics_url'], + 'quality': provider.get('quality', 0.5), + 'source_id': provider.get('source_id') + } + ) + if provider.get('genres'): + enrichment_candidates['genres'].append( + { + 'provider': provider['provider'], + 'value': provider['genres'], + 'quality': provider.get('quality', 0.5), + 'source_id': provider.get('source_id') + } + ) + if provider.get('tags'): + 
enrichment_candidates['tags'].append( + { + 'provider': provider['provider'], + 'value': provider['tags'], + 'quality': provider.get('quality', 0.5), + 'source_id': provider.get('source_id') + } + ) + + return { + key: { + 'selected_source': _pick_best_candidate(value_candidates), + 'candidates': value_candidates + } + for key, value_candidates in enrichment_candidates.items() + } + + def _build_match_result( + self, + status: str, + reason: str, + message: str, + candidate: dict, + candidates_json: list[dict], + enrichment: dict, + provider_warnings: list[dict] + ) -> dict: + return { + 'status': status, + 'reason': reason, + 'message': message, + 'source': candidate.get('provider'), + 'confidence': candidate.get('score'), + 'is_authoritative': bool(candidate.get('is_authoritative')), + 'matched_metadata_json': _serialize_metadata(candidate), + 'match_candidates_json': candidates_json, + 'match_enrichment_json': enrichment, + 'provider_warnings': provider_warnings + } + + +def _append_query_params(url: str, params: dict | None) -> str: + if not params: + return url + query = parse.urlencode( + { + key: value + for key, value in params.items() + if value is not None and value != '' + }, + doseq=True + ) + separator = '&' if parse.urlparse(url).query else '?' 
+ return f'{url}{separator}{query}' if query else url + + +def _normalize_base_url(value: str | None) -> str: + if not value: + return '' + return value.rstrip('/') + + +def _build_text_query(item_metadata: dict) -> str: + query_parts = [ + item_metadata.get('title'), + item_metadata.get('artist'), + item_metadata.get('album') + ] + return ' '.join(part.strip() for part in query_parts if isinstance(part, str) and part.strip()) + + +def _build_input_metadata(item: dict) -> dict: + tags = item.get('original_tags_json') or {} + audio_props = item.get('audio_props_json') or {} + inferred_title = Path(item.get('relative_path') or item.get('filename') or '').stem + release_date = tags.get('date') or tags.get('year') + + return { + 'title': _first_non_empty(tags.get('title'), inferred_title), + 'artist': _first_non_empty(tags.get('artist'), tags.get('album_artist')), + 'artists': _split_artists(tags.get('artist')), + 'album': tags.get('album'), + 'album_artist': tags.get('album_artist'), + 'track_number': _parse_track_number(tags.get('track_number') or tags.get('track')), + 'disc_number': _parse_track_number(tags.get('disc_number') or tags.get('disc')), + 'duration_seconds': ( + item.get('fingerprint_duration_seconds') + or audio_props.get('duration_seconds') + ), + 'release_date': release_date, + 'year': _extract_year(release_date) + } + + +def _serialize_metadata(candidate: dict) -> dict: + return { + 'title': candidate.get('title'), + 'artist': candidate.get('artist'), + 'artists': candidate.get('artists') or [], + 'album': candidate.get('album'), + 'album_artist': candidate.get('album_artist'), + 'track_number': candidate.get('track_number'), + 'disc_number': candidate.get('disc_number'), + 'release_date': candidate.get('release_date'), + 'year': candidate.get('year'), + 'duration_seconds': candidate.get('duration_seconds'), + 'recording_id': candidate.get('recording_id'), + 'release_id': candidate.get('release_id'), + 'release_group_id': 
candidate.get('release_group_id'), + 'source_ids': candidate.get('source_ids') or {} + } + + +def _serialize_candidate(candidate: dict) -> dict: + return { + 'provider': candidate.get('provider'), + 'score': candidate.get('score'), + 'score_breakdown': candidate.get('score_breakdown') or {}, + 'is_authoritative': bool(candidate.get('is_authoritative')), + 'title': candidate.get('title'), + 'artist': candidate.get('artist'), + 'album': candidate.get('album'), + 'recording_id': candidate.get('recording_id'), + 'release_id': candidate.get('release_id'), + 'release_group_id': candidate.get('release_group_id'), + 'source_ids': candidate.get('source_ids') or {} + } + + +def _dedupe_candidates(candidates: list[dict]) -> list[dict]: + deduped: dict[str, dict] = {} + for candidate in candidates: + dedupe_key = '|'.join( + [ + candidate.get('provider') or '', + candidate.get('recording_id') or '', + candidate.get('release_id') or '', + _normalize_text(candidate.get('title')), + _normalize_text(candidate.get('artist')), + _normalize_text(candidate.get('album')) + ] + ) + current = deduped.get(dedupe_key) + if current is None or (candidate.get('score') or 0) > (current.get('score') or 0): + deduped[dedupe_key] = candidate + return list(deduped.values()) + + +def _extract_artist_names(artist_credit: list[dict]) -> list[str]: + names = [] + for artist in artist_credit: + if artist.get('name'): + names.append(artist['name']) + continue + nested_artist = artist.get('artist') or {} + if nested_artist.get('name'): + names.append(nested_artist['name']) + return names + + +def _join_artists(artists: list[str]) -> str | None: + if not artists: + return None + return ', '.join(artists) + + +def _milliseconds_to_seconds(value) -> float | None: + if value in (None, ''): + return None + return round(_to_float(value) / 1000.0, 2) + + +def _find_release_track(release: dict, recording_id: str) -> dict: + for medium in release.get('media') or []: + disc_number = 
_parse_track_number(medium.get('position')) + for track in medium.get('tracks') or []: + nested_recording = track.get('recording') or {} + if nested_recording.get('id') == recording_id: + return { + **track, + 'disc_number': disc_number + } + return {} + + +def _build_release_tracklist(release: dict) -> list[dict]: + tracklist = [] + for medium in release.get('media') or []: + disc_number = _parse_track_number(medium.get('position')) + for track in medium.get('tracks') or []: + nested_recording = track.get('recording') or {} + tracklist.append( + { + 'title': track.get('title') or nested_recording.get('title'), + 'track_number': _parse_track_number(track.get('position') or track.get('number')), + 'disc_number': disc_number, + 'duration_seconds': _milliseconds_to_seconds(track.get('length')) + } + ) + return tracklist + + +def _text_similarity(left: str | None, right: str | None) -> float: + normalized_left = _normalize_text(left) + normalized_right = _normalize_text(right) + if not normalized_left or not normalized_right: + return 0.0 + if normalized_left == normalized_right: + return 1.0 + left_tokens = set(normalized_left.split()) + right_tokens = set(normalized_right.split()) + overlap = len(left_tokens & right_tokens) + return overlap / max(len(left_tokens), len(right_tokens), 1) + + +def _artist_similarity(item_metadata: dict, candidate: dict) -> float: + artist_candidates = [ + candidate.get('artist'), + _join_artists(candidate.get('artists') or []), + candidate.get('album_artist') + ] + return max( + (_text_similarity(item_metadata.get('artist'), artist_name) for artist_name in artist_candidates), + default=0.0 + ) + + +def _duration_similarity(source_duration, candidate_duration) -> float: + if source_duration in (None, '') or candidate_duration in (None, ''): + return 0.0 + delta = abs(_to_float(source_duration) - _to_float(candidate_duration)) + if delta <= 1: + return 1.0 + if delta <= 3: + return 0.8 + if delta <= 5: + return 0.6 + if delta <= 10: + 
return 0.3 + return 0.0 + + +def _track_disc_similarity(item_metadata: dict, candidate: dict) -> float: + score = 0.0 + if item_metadata.get('track_number') and candidate.get('track_number'): + if item_metadata['track_number'] == candidate['track_number']: + score += 0.6 + if item_metadata.get('disc_number') and candidate.get('disc_number'): + if item_metadata['disc_number'] == candidate['disc_number']: + score += 0.4 + return score + + +def _album_context_similarity(album_group: list[dict], candidate: dict) -> float: + release_tracklist = candidate.get('release_tracklist') or [] + if len(album_group) < 2 or not release_tracklist: + return 0.0 + + comparable_items = 0 + matched_items = 0 + for item in album_group: + item_metadata = _build_input_metadata(item) + track_number = item_metadata.get('track_number') + if not track_number: + continue + comparable_items += 1 + track_match = next( + ( + track + for track in release_tracklist + if track.get('track_number') == track_number + and ( + not item_metadata.get('disc_number') + or not track.get('disc_number') + or track.get('disc_number') == item_metadata.get('disc_number') + ) + ), + None + ) + if not track_match: + continue + title_ok = _text_similarity(item_metadata.get('title'), track_match.get('title')) >= 0.7 + duration_ok = _duration_similarity( + item_metadata.get('duration_seconds'), + track_match.get('duration_seconds') + ) >= 0.6 + if title_ok or duration_ok: + matched_items += 1 + + if comparable_items == 0: + return 0.0 + return matched_items / comparable_items + + +def _version_penalty(item_metadata: dict, candidate: dict) -> float: + item_tokens = _extract_version_tokens(item_metadata.get('title')) + candidate_tokens = _extract_version_tokens(candidate.get('title')) + if not item_tokens and not candidate_tokens: + return 0.0 + if item_tokens == candidate_tokens: + return 0.0 + return 8.0 + + +def _extract_version_tokens(value: str | None) -> set[str]: + normalized = _normalize_text(value) + if not 
normalized: + return set() + return {token for token in normalized.split() if token in VERSION_TOKENS} + + +def _parse_track_number(value) -> int | None: + if value in (None, ''): + return None + match = re.search(r'\d+', str(value)) + return int(match.group(0)) if match else None + + +def _extract_year(value: str | None) -> int | None: + if not value: + return None + match = re.search(r'(\d{4})', str(value)) + return int(match.group(1)) if match else None + + +def _normalize_text(value: str | None) -> str: + if not value: + return '' + cleaned = re.sub(r'[^a-z0-9]+', ' ', str(value).lower()) + return ' '.join(cleaned.split()) + + +def _first_non_empty(*values): + for value in values: + if isinstance(value, str) and value.strip(): + return value.strip() + return None + + +def _split_artists(value: str | None) -> list[str]: + if not value: + return [] + return [part.strip() for part in re.split(r'[,/&]| feat\. ', value) if part.strip()] + + +def _provider_rank(provider: str | None) -> int: + provider_order = { + 'acoustid': 6, + 'musicbrainz': 5, + 'netease': 4, + 'qq': 3, + 'spotify': 2 + } + return provider_order.get(provider or '', 0) + + +def _pick_best_candidate(candidates: list[dict]) -> dict | None: + if not candidates: + return None + return max(candidates, key=lambda candidate: candidate.get('quality', 0)) + + +def _unique_non_empty(values: list[str]) -> list[str]: + unique_values = [] + seen_values: set[str] = set() + for value in values: + if not value or value in seen_values: + continue + seen_values.add(value) + unique_values.append(value) + return unique_values + + +def _format_timestamp_date(value) -> str | None: + if value in (None, ''): + return None + if isinstance(value, (int, float)) and value > 1000: + return time.strftime('%Y-%m-%d', time.gmtime(value / 1000)) + return str(value) + + +def _to_float(value) -> float: + try: + return float(value) + except (TypeError, ValueError): + return 0.0 diff --git a/backend/app/metadata_normalization.py 
def merge_metadata_layers(
    raw_metadata: dict[str, Any] | None,
    matched_metadata: dict[str, Any] | None,
    metadata_patch: dict[str, Any] | None = None
) -> dict[str, Any]:
    """Combine the three metadata layers into one new dictionary.

    Layers are applied in order of increasing priority:

    1. ``raw_metadata`` is copied as-is.
    2. ``matched_metadata`` overrides a field only when its value is
       neither ``None`` nor an empty string.
    3. ``metadata_patch`` overrides a field whenever its value is not
       ``None`` (so an empty string in the patch does replace a field).

    None of the input dictionaries is modified.
    """
    combined: dict[str, Any] = {}
    if raw_metadata:
        combined.update(raw_metadata)

    for field, candidate in (matched_metadata or {}).items():
        if candidate in (None, ''):
            continue
        combined[field] = candidate

    for field, override in (metadata_patch or {}).items():
        if override is not None:
            combined[field] = override

    return combined
normalize_item( + self, + item: dict, + metadata_patch: dict[str, Any] | None = None, + cache: dict[str, dict[Any, Any]] | None = None + ) -> dict[str, Any]: + merged = merge_metadata_layers( + item.get('original_tags_json'), + item.get('matched_metadata_json'), + metadata_patch + ) + group_cache_key, group_entries = self._build_group_entries(item, metadata_patch, cache) + return self._normalize_merged_metadata(merged, group_entries, cache, group_cache_key) + + def _build_group_entries( + self, + item: dict, + metadata_patch: dict[str, Any] | None, + cache: dict[str, dict[Any, Any]] | None + ) -> tuple[Any, list[dict[str, Any]]]: + current_merged = merge_metadata_layers( + item.get('original_tags_json'), + item.get('matched_metadata_json'), + metadata_patch + ) + task_items = self._get_task_items(item['task_id'], cache) + current_group_key = self._group_key(current_merged, item) + cache_key = self._group_cache_key(item, current_group_key, metadata_patch) + if cache is not None and cache_key in cache['group_entries']: + return cache_key, cache['group_entries'][cache_key] + entries: list[dict[str, Any]] = [] + + for candidate in task_items: + candidate_patch = metadata_patch if candidate['id'] == item['id'] else None + merged = merge_metadata_layers( + candidate.get('original_tags_json'), + candidate.get('matched_metadata_json'), + candidate_patch + ) + if self._group_key(merged, candidate) != current_group_key: + continue + entries.append( + { + 'item_id': candidate['id'], + 'metadata': merged, + 'artist_info': parse_artist_string(merged.get('artist')) + } + ) + + if not entries: + entries = [ + { + 'item_id': item['id'], + 'metadata': current_merged, + 'artist_info': parse_artist_string(current_merged.get('artist')) + } + ] + + if cache is not None: + cache['group_entries'][cache_key] = entries + + return cache_key, entries + + def _get_task_items( + self, + task_id: str, + cache: dict[str, dict[Any, Any]] | None + ) -> list[dict[str, Any]]: + if cache is None: + 
return self.task_store.list_all_task_items(task_id, active_only=True) + + task_items = cache['task_items'].get(task_id) + if task_items is None: + task_items = self.task_store.list_all_task_items(task_id, active_only=True) + cache['task_items'][task_id] = task_items + return task_items + + def _group_cache_key( + self, + item: dict, + current_group_key: tuple[str, str], + metadata_patch: dict[str, Any] | None + ) -> tuple[Any, ...]: + patch_key = self._metadata_patch_cache_key(metadata_patch) + if patch_key is None: + return (item['task_id'], current_group_key) + return (item['task_id'], current_group_key, item['id'], patch_key) + + def _metadata_patch_cache_key(self, metadata_patch: dict[str, Any] | None) -> tuple[tuple[str, str], ...] | None: + if not metadata_patch: + return None + return tuple(sorted((key, repr(value)) for key, value in metadata_patch.items())) + + def _group_key(self, metadata: dict[str, Any], item: dict) -> tuple[str, str]: + for key in ('release_id', 'release_group_id'): + value = _clean_token(metadata.get(key)) + if value: + return (key, value) + + album = _clean_token(metadata.get('album')) + if album: + return ('album', album) + + parent_dir = Path(item.get('relative_path') or item.get('filename') or '').parent.as_posix() + if parent_dir and parent_dir != '.': + return ('path', _clean_token(parent_dir)) + + return ('item', str(item['id'])) + + def _normalize_merged_metadata( + self, + merged: dict[str, Any], + group_entries: list[dict[str, Any]], + cache: dict[str, dict[Any, Any]] | None, + group_cache_key: Any + ) -> dict[str, Any]: + normalized = dict(merged) + artist_value = str(normalized.get('artist') or '').strip() + album_artist_value = str(normalized.get('album_artist') or '').strip() + album_value = str(normalized.get('album') or '').strip() + artist_info = parse_artist_string(artist_value) + + if album_artist_value: + normalized['album_artist'] = album_artist_value + normalized['normalization_strategy'] = 'source_preserved' + 
normalized['album_artist_reason'] = '保留来源或人工指定的专辑艺术家' + normalized['artist_tokens'] = artist_info['tokens'] + normalized['display_artist'] = artist_info['display_artist'] + normalized['compilation'] = 1 if _truthy_compilation(normalized.get('compilation')) else 0 + return normalize_metadata_shape(normalized) + + group_analysis = self._analyze_group_entries(group_entries, cache, group_cache_key) + dominant_artist = group_analysis['dominant_artist'] + dominant_ratio = group_analysis['dominant_ratio'] + unique_main_artists = group_analysis['unique_main_artists'] + has_collaboration_markup = group_analysis['has_collaboration_markup'] + compilation_keyword_hit = _has_compilation_keyword(album_value) + + strategy = 'unresolved' + album_artist = '' + reason = '无法从当前专辑分组推导专辑艺术家' + compilation = 0 + + if len(unique_main_artists) == 1 and dominant_artist and not has_collaboration_markup: + strategy = 'single_artist' + album_artist = dominant_artist + reason = '同专辑曲目的主艺人一致,按单艺人专辑处理' + elif len(unique_main_artists) == 1 and dominant_artist and has_collaboration_markup: + strategy = 'main_artist_feat' + album_artist = dominant_artist + reason = '同专辑主艺人一致,但存在 feat/合作曲目,按主艺人专辑处理' + elif dominant_artist and dominant_ratio >= MAIN_ARTIST_THRESHOLD: + strategy = 'main_artist_feat' + album_artist = dominant_artist + reason = f'主艺人 {dominant_artist} 在同专辑中占比达到 {dominant_ratio:.0%}' + elif len(unique_main_artists) > 1 and compilation_keyword_hit: + strategy = 'compilation' + album_artist = VARIOUS_ARTISTS + reason = '多艺人分散且专辑名命中合辑/原声带关键词' + compilation = 1 + elif dominant_artist: + strategy = 'dominant_artist_fallback' + album_artist = dominant_artist + reason = f'未命中合辑规则,回退到出现频次最高的主艺人 {dominant_artist}' + elif artist_value: + strategy = 'single_track_fallback' + album_artist = artist_info['primary'] or artist_value + reason = '仅有当前曲目可用,回退到当前曲目艺人' + + normalized['album_artist'] = album_artist + normalized['compilation'] = compilation + normalized['normalization_strategy'] = strategy + 
normalized['album_artist_reason'] = reason + normalized['artist_tokens'] = artist_info['tokens'] + normalized['display_artist'] = artist_info['display_artist'] + return normalize_metadata_shape(normalized) + + def _analyze_group_entries( + self, + group_entries: list[dict[str, Any]], + cache: dict[str, dict[Any, Any]] | None, + group_cache_key: Any + ) -> dict[str, Any]: + if cache is not None and group_cache_key in cache['group_analysis']: + return cache['group_analysis'][group_cache_key] + + main_artists = [entry['artist_info']['primary'] for entry in group_entries if entry['artist_info']['primary']] + has_collaboration_markup = any(len(entry['artist_info']['tokens']) > 1 for entry in group_entries) + main_artist_counts = Counter(main_artists) + unique_main_artists = set(main_artists) + dominant_artist, dominant_count = main_artist_counts.most_common(1)[0] if main_artist_counts else (None, 0) + analysis = { + 'dominant_artist': dominant_artist, + 'dominant_ratio': (dominant_count / len(main_artists)) if main_artists else 0.0, + 'unique_main_artists': unique_main_artists, + 'has_collaboration_markup': has_collaboration_markup + } + if cache is not None: + cache['group_analysis'][group_cache_key] = analysis + return analysis + + +def parse_artist_string(value: Any) -> dict[str, Any]: + display_artist = str(value or '').strip() + if not display_artist: + return {'display_artist': '', 'tokens': [], 'primary': ''} + + normalized = unicodedata.normalize('NFKC', display_artist) + tokens = [ + _normalize_artist_token(token) + for token in ARTIST_SPLIT_PATTERN.split(normalized) + if _normalize_artist_token(token) + ] + if not tokens: + tokens = [_normalize_artist_token(normalized)] if _normalize_artist_token(normalized) else [] + return { + 'display_artist': display_artist, + 'tokens': tokens, + 'primary': tokens[0] if tokens else '' + } + + +def _normalize_artist_token(value: str) -> str: + cleaned = unicodedata.normalize('NFKC', str(value or '')).strip() + cleaned = 
def _has_compilation_keyword(album: str) -> bool:
    """Return True when the album name contains a compilation keyword.

    The album name is normalised with ``_clean_token`` (which lower-cases
    it) and checked against every entry of ``COMPILATION_KEYWORDS``:

    * ASCII keywords must not touch another ASCII letter on either side.
      ``'ost'`` therefore matches ``'Movie OST'``, ``'电影OST'`` and
      ``'ost2'``, but no longer matches inside ``'Ghost'`` or ``'Lost'``,
      and ``'top'`` no longer matches inside ``'Stop'``.  Digits are
      allowed as neighbours so names such as ``'Top100'`` still hit.
    * Non-ASCII keywords keep plain substring matching.
    """
    normalized = _clean_token(album)
    if not normalized:
        return False

    for keyword in COMPILATION_KEYWORDS:
        if not keyword.isascii():
            if keyword in normalized:
                return True
            continue
        # Look-arounds restricted to [a-z] rather than \b: \b treats CJK
        # characters and digits as word characters and would reject
        # '电影ost' or 'top100'.
        if re.search(rf'(?<![a-z]){re.escape(keyword)}(?![a-z])', normalized):
            return True
    return False
class PreprocessItemError(Exception):
    """Failure while preprocessing a single item.

    ``reason`` is the short reason code passed by the raiser and
    ``message`` is the accompanying text, which is also what ``str()`` of
    the exception returns.
    """

    def __init__(self, reason: str, message: str):
        Exception.__init__(self, message)
        self.message = message
        self.reason = reason
def parse_cue(self, cue_path: Path) -> CueSheet:
    """Parse a CUE sheet into album-level fields and a list of tracks.

    Only the ``TRACK``, ``TITLE``, ``PERFORMER`` and ``INDEX 01`` commands
    are interpreted; all other lines are ignored.  ``TITLE``/``PERFORMER``
    lines seen before the first ``TRACK`` describe the album, later ones
    describe the current track.  A track is kept only if an ``INDEX 01``
    start time was found for it.

    Raises:
        PreprocessItemError: with reason ``'split_failed'`` when a TRACK
            line is malformed, an INDEX 01 timestamp cannot be parsed, or
            no usable track was found.
    """
    album_title = None
    album_performer = None
    tracks: list[CueTrack] = []
    current_track: dict | None = None

    def flush_current_track() -> None:
        # A track can only be split out once its INDEX 01 start is known.
        if current_track and current_track.get('index_seconds') is not None:
            tracks.append(CueTrack(**current_track))

    # 'utf-8-sig' drops a leading byte-order mark.  With plain 'utf-8' the
    # BOM stays glued to the first keyword (e.g. '\ufeffPERFORMER'), so the
    # first command of the sheet was silently ignored.
    cue_text = cue_path.read_text(encoding='utf-8-sig', errors='replace')

    for raw_line in cue_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        keyword, _, remainder = line.partition(' ')
        keyword = keyword.upper()
        remainder = remainder.strip()

        if keyword == 'TRACK':
            flush_current_track()
            parts = remainder.split()
            if len(parts) < 2 or not parts[0].isdigit():
                raise PreprocessItemError('split_failed', 'CUE TRACK 行格式无效')
            current_track = {
                'number': int(parts[0]),
                'title': None,
                'performer': None,
                'index_seconds': None
            }
        elif keyword == 'TITLE':
            value = _strip_cue_value(remainder)
            if current_track is None:
                album_title = value
            else:
                current_track['title'] = value
        elif keyword == 'PERFORMER':
            value = _strip_cue_value(remainder)
            if current_track is None:
                album_performer = value
            else:
                current_track['performer'] = value
        elif keyword == 'INDEX' and current_track is not None:
            parts = remainder.split()
            if len(parts) >= 2 and parts[0] == '01':
                try:
                    current_track['index_seconds'] = _cue_time_to_seconds(parts[1])
                except ValueError as error:
                    # Report a malformed timestamp through the same error
                    # type as every other CUE problem instead of leaking a
                    # bare ValueError.
                    raise PreprocessItemError('split_failed', 'CUE INDEX 时间格式无效') from error

    flush_current_track()

    if not tracks:
        raise PreprocessItemError('split_failed', 'CUE 中未找到可用 TRACK/INDEX 记录')

    return CueSheet(
        cue_path=cue_path,
        album_title=album_title,
        album_performer=album_performer,
        tracks=tracks
    )
command.extend(['-to', str(max(total_duration_seconds - track.index_seconds, 0.01))]) + + command.extend(['-map', '0:a:0', '-vn', '-c:a', 'flac', str(output_path)]) + self._run_command(command, 'split_failed', f'CUE 切轨失败: {output_path.name}') + + generated_tracks.append( + { + 'path': str(output_path.resolve(strict=False)), + 'filename': output_path.name, + 'track_number': track.number, + 'title': track.title, + 'artist': track.performer or cue_sheet.album_performer, + 'album': cue_sheet.album_title, + 'album_artist': cue_sheet.album_performer + } + ) + + return generated_tracks + + def convert_to_flac(self, source_path: str, output_path: Path) -> str: + output_path.parent.mkdir(parents=True, exist_ok=True) + command = [ + 'ffmpeg', + '-y', + '-hide_banner', + '-loglevel', + 'error', + '-i', + source_path, + '-map', + '0:a:0', + '-vn', + '-c:a', + 'flac', + str(output_path) + ] + self._run_command(command, 'convert_failed', '音频转码失败') + return str(output_path.resolve(strict=False)) + + def probe_audio(self, file_path: str) -> dict: + command = [ + 'ffprobe', + '-v', + 'error', + '-print_format', + 'json', + '-show_format', + '-show_streams', + file_path + ] + result = self._run_command(command, 'probe_failed', '音频信息探测失败') + + try: + payload = json.loads(result.stdout) + except json.JSONDecodeError as error: + raise PreprocessItemError('probe_failed', 'ffprobe 输出无法解析') from error + + audio_stream = next( + (stream for stream in payload.get('streams', []) if stream.get('codec_type') == 'audio'), + None + ) + if audio_stream is None: + raise PreprocessItemError('probe_failed', '未找到可用音频流') + + format_info = payload.get('format', {}) + duration_seconds = _safe_float(format_info.get('duration')) or _safe_float(audio_stream.get('duration')) + + return { + 'format': (format_info.get('format_name') or Path(file_path).suffix.lstrip('.')).upper(), + 'codec': (audio_stream.get('codec_name') or '').upper() or None, + 'bitrate': _safe_int(audio_stream.get('bit_rate')) or 
_safe_int(format_info.get('bit_rate')), + 'sample_rate': _safe_int(audio_stream.get('sample_rate')), + 'bit_depth': _safe_int(audio_stream.get('bits_per_raw_sample')) or _safe_int(audio_stream.get('bits_per_sample')), + 'channels': _safe_int(audio_stream.get('channels')), + 'duration_seconds': round(duration_seconds, 3) if duration_seconds is not None else None + } + + def read_tags(self, file_path: str) -> dict: + mutagen = importlib.import_module('mutagen') + tags_file = mutagen.File(file_path, easy=True) + + if tags_file is None or not getattr(tags_file, 'tags', None): + return {} + + def first_value(key: str) -> str | None: + value = tags_file.tags.get(key) + if isinstance(value, list) and value: + return str(value[0]) + if value is not None: + return str(value) + return None + + return { + 'title': first_value('title'), + 'artist': first_value('artist'), + 'album': first_value('album'), + 'album_artist': first_value('albumartist'), + 'track_number': first_value('tracknumber'), + 'disc_number': first_value('discnumber'), + 'date': first_value('date'), + 'genre': first_value('genre') + } + + def extract_embedded_cover(self, file_path: str, output_path: Path) -> str | None: + mutagen = importlib.import_module('mutagen') + tags_file = mutagen.File(file_path) + + if tags_file is None: + return None + + image_bytes = None + if getattr(tags_file, 'pictures', None): + if tags_file.pictures: + image_bytes = tags_file.pictures[0].data + elif getattr(tags_file, 'tags', None): + tags = tags_file.tags + apic_keys = [key for key in tags.keys() if str(key).startswith('APIC')] + if apic_keys: + image_bytes = tags[apic_keys[0]].data + elif 'covr' in tags and tags['covr']: + image_bytes = bytes(tags['covr'][0]) + + if not image_bytes: + return None + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_bytes(image_bytes) + return str(output_path.resolve(strict=False)) + + def calculate_fingerprint(self, file_path: str) -> dict: + command = [ + 'fpcalc', + 
'-length', + '120', + file_path + ] + result = self._run_command(command, 'fingerprint_failed', '声学指纹计算失败') + + duration = None + fingerprint = None + for line in result.stdout.splitlines(): + if line.startswith('DURATION='): + duration = _safe_float(line.split('=', 1)[1]) + elif line.startswith('FINGERPRINT='): + fingerprint = line.split('=', 1)[1].strip() + + if not fingerprint: + raise PreprocessItemError('fingerprint_failed', '未生成有效声学指纹') + + return { + 'fingerprint': fingerprint, + 'duration_seconds': duration + } + + def _run_command( + self, + command: list[str], + reason: str, + message: str + ) -> subprocess.CompletedProcess[str]: + try: + return subprocess.run( + command, + check=True, + capture_output=True, + text=True + ) + except subprocess.CalledProcessError as error: + stderr = error.stderr.strip() if error.stderr else '' + detail = f'{message}: {stderr}' if stderr else message + raise PreprocessItemError(reason, detail) from error + + +def build_preprocess_paths(task_id: str, item_id: int) -> dict[str, Path]: + root = Path(TASK_WORKSPACE_ROOT) / task_id / 'preprocess' + return { + 'root': root, + 'split': root / 'split' / str(item_id), + 'converted': root / 'converted' / f'{item_id}.flac', + 'cover': root / 'covers' / f'{item_id}.jpg' + } + + +def build_split_child_relative_path(parent_relative_path: str, filename: str) -> str: + parent_path = Path(parent_relative_path) + return (parent_path.parent / filename).as_posix() + + +def merge_tag_snapshots(primary: dict | None, fallback: dict | None) -> dict: + merged: dict[str, str | None] = {} + for source in (primary or {}, fallback or {}): + for key, value in source.items(): + if value is not None and merged.get(key) in (None, ''): + merged[key] = value + return merged + + +def _cue_time_to_seconds(value: str) -> float: + minute, second, frame = value.split(':') + return int(minute) * 60 + int(second) + (int(frame) / 75.0) + + +def _strip_cue_value(value: str) -> str: + if value.startswith('"') and 
value.endswith('"'): + return value[1:-1] + return value + + +def _safe_int(value) -> int | None: + try: + return int(value) if value not in (None, '') else None + except (TypeError, ValueError): + return None + + +def _safe_float(value) -> float | None: + try: + return float(value) if value not in (None, '') else None + except (TypeError, ValueError): + return None diff --git a/backend/app/repair_runner.py b/backend/app/repair_runner.py new file mode 100644 index 0000000..1210f52 --- /dev/null +++ b/backend/app/repair_runner.py @@ -0,0 +1,841 @@ +import importlib +import shutil +from pathlib import Path + +from .exception_service import ExceptionService +from .library_postprocess import ( + OrganizeItemError, + _build_organize_plan, + _build_prefixed_name, + _build_quality_breakdown, + _build_unique_destination, + _serialize_compared_candidate +) +from .metadata_normalization import ( + MetadataNormalizationService, + can_ingest_metadata, + merge_metadata_layers +) +from .matcher import Matcher +from .preprocessor import PreprocessItemError, Preprocessor +from .task_constants import ( + STAGE_STATUS_COMPLETED, + STAGE_STATUS_FAILED, + STAGE_STATUS_RUNNING, + TASK_STATUS_COMPLETED, + TASK_STATUS_FAILED, + TASK_STATUS_RUNNING, + current_timestamp, + create_empty_repair_stats, + create_pending_repair_stage_states +) + + +class RepairExecutionError(Exception): + def __init__(self, reason: str, message: str): + super().__init__(message) + self.reason = reason + self.message = message + +class MetadataPatchService: + def apply(self, item: dict, metadata_patch: dict) -> tuple[dict, bool]: + patch = {key: value for key, value in (metadata_patch or {}).items() if value is not None} + if not patch: + return item, False + + normalized_metadata = self.metadata_normalizer.normalize_item(item, patch) + merged_tags = dict(item.get('original_tags_json') or {}) + merged_tags.update({key: value for key, value in normalized_metadata.items() if key in self.writable_keys}) + + 
def _write_tags(self, file_path: Path, tags: dict):
    """Write the given tag values into the audio file at *file_path*.

    The file is opened through mutagen's "easy" interface.  Internal field
    names are translated to easy-tag keys (``album_artist`` to
    ``albumartist``, ``year`` to ``date``, and so on).  Fields that are
    missing, ``None`` or empty are left untouched in the file.

    Raises:
        RepairExecutionError: with reason ``'metadata_write_failed'`` when
            mutagen cannot open the file.
    """
    mutagen = importlib.import_module('mutagen')
    tags_file = mutagen.File(str(file_path), easy=True)
    if tags_file is None:
        raise RepairExecutionError('metadata_write_failed', f'无法写入标签: {file_path}')

    key_mapping = {
        'title': 'title',
        'artist': 'artist',
        'album': 'album',
        'album_artist': 'albumartist',
        'track_number': 'tracknumber',
        'disc_number': 'discnumber',
        'year': 'date',
        'lyrics': 'lyrics'
    }
    for source_key, target_key in key_mapping.items():
        value = tags.get(source_key)
        if value in (None, ''):
            continue
        try:
            tags_file[target_key] = [str(value)]
        except KeyError:
            # mutagen's easy wrappers (e.g. EasyID3, EasyMP4) only accept
            # keys they have a mapping for and raise a KeyError subclass
            # otherwise - 'lyrics' is one such key for MP3/M4A.  Skip the
            # unsupported field so the remaining tags are still written
            # instead of aborting the whole update.
            continue
    tags_file.save()
def resolve_destination(self, desired_path: Path, source_path: Path) -> tuple[Path, int]:
    """Pick a destination path that does not clobber a different file.

    Starting with *desired_path*, candidates ``'<stem> (2)<suffix>'``,
    ``'<stem> (3)<suffix>'``, ... in the same directory are tried until one
    either does not exist or resolves to the same file as *source_path*.

    Returns:
        ``(path, index)`` where *index* is 1 when *desired_path* itself was
        chosen and otherwise the number used in the suffix.
    """
    stem, suffix = desired_path.stem, desired_path.suffix
    attempt = 1
    target = desired_path
    while True:
        if not target.exists():
            break
        # An existing target that *is* the source needs no renaming.
        if target.resolve(strict=False) == source_path.resolve(strict=False):
            break
        attempt += 1
        target = desired_path.with_name(f'{stem} ({attempt}){suffix}')
    return target, attempt
class MatchRetryService:
    """Re-run or manually settle the metadata match of a single task item.

    All collaborators are injected: ``task_store`` persists item fields,
    ``matcher`` performs the lookup and ``metadata_normalizer`` converts the
    raw matched metadata into the form that is stored on the item.
    """

    def __init__(self, task_store, matcher: Matcher, metadata_normalizer: MetadataNormalizationService):
        self.task_store = task_store
        self.matcher = matcher
        self.metadata_normalizer = metadata_normalizer

    def retry_match(self, item: dict, config_snapshot: dict, providers: list[str] | None = None) -> dict:
        """Match ``item`` again and persist the outcome.

        Args:
            item: Task item row; must contain ``id``.
            config_snapshot: Configuration used for matching. It is copied,
                never mutated.
            providers: Optional provider names. When non-empty they are passed
                to the matcher under ``repair_provider_scope``.

        Returns:
            Whatever ``task_store.update_task_item`` returns for the item.
        """
        scoped_config = dict(config_snapshot)
        if providers:
            scoped_config['repair_provider_scope'] = providers

        # The item is matched against a batch that contains only itself.
        outcome = self.matcher.match_item(item, [item], scoped_config)
        normalized = self.metadata_normalizer.normalize_item(
            {**item, 'matched_metadata_json': outcome['matched_metadata_json']}
        )

        item_id = item['id']
        field_updates = {
            'match_status': outcome['status'],
            'match_reason': outcome['reason'],
            'match_message': outcome['message'],
            'match_source': outcome['source'],
            'match_confidence': outcome['confidence'],
            # Stored as an integer flag rather than a bool.
            'match_is_authoritative': int(bool(outcome['is_authoritative'])),
            'matched_metadata_json': normalized,
            'match_candidates_json': outcome['match_candidates_json'],
            'match_enrichment_json': outcome['match_enrichment_json'],
        }
        return self.task_store.update_task_item(item_id, **field_updates)

    def select_candidate(self, item: dict, candidate_index: int) -> dict:
        """Promote one of the item's stored match candidates to its match.

        Args:
            item: Task item row; candidates are read from
                ``match_candidates_json``.
            candidate_index: Zero-based position of the chosen candidate.

        Returns:
            Whatever ``task_store.update_task_item`` returns for the item.

        Raises:
            RepairExecutionError: If ``candidate_index`` is out of range.
        """
        candidates = item.get('match_candidates_json') or []
        if candidate_index >= len(candidates) or candidate_index < 0:
            raise RepairExecutionError('candidate_index_invalid', '候选索引无效')

        chosen = candidates[candidate_index]
        copied_fields = (
            'title',
            'artist',
            'album',
            'album_artist',
            'track_number',
            'disc_number',
            'year',
            'lyrics',
            'recording_id',
            'release_id',
            'release_group_id',
        )
        selected_metadata = {field: chosen.get(field) for field in copied_fields}
        selected_metadata['source_ids'] = chosen.get('source_ids') or {}

        normalized = self.metadata_normalizer.normalize_item(
            {**item, 'matched_metadata_json': selected_metadata}
        )

        item_id = item['id']
        field_updates = {
            'match_status': 'matched_fallback',
            'match_reason': 'manual_candidate_selected',
            'match_message': '已手动确认匹配候选',
            'match_source': chosen.get('provider'),
            'match_confidence': chosen.get('score'),
            'match_is_authoritative': int(bool(chosen.get('is_authoritative'))),
            'matched_metadata_json': normalized,
        }
        return self.task_store.update_task_item(item_id, **field_updates)
class DedupeDecisionService:
    """Apply a manual decision to an item flagged as a duplicate of a library file.

    ``task_store`` persists item fields; ``organize_service`` performs the
    physical moves (review trash, library placement).
    """

    def __init__(self, task_store, organize_service: OrganizeService):
        self.task_store = task_store
        self.organize_service = organize_service

    def keep_existing(self, item: dict, *, task_id: str, trash_root: str) -> dict:
        """Keep the library copy and move the incoming file to the review trash.

        If the item already records a ``trash_file_path`` only its dedupe
        status is updated; no file is moved again.

        Returns:
            The updated item as returned by ``task_store.update_task_item``.
        """
        if item.get('trash_file_path'):
            return self.task_store.update_task_item(
                item['id'],
                dedupe_status='duplicate_trashed',
                dedupe_message='已保留库内文件'
            )
        trashed_path = self.organize_service.move_to_review_trash(
            trash_root=trash_root,
            task_id=task_id,
            item_id=item['id'],
            source_path=item['current_file_path'],
            reason='duplicates'
        )
        return self.task_store.update_task_item(
            item['id'],
            is_active=0,
            current_file_path=trashed_path,
            trash_file_path=trashed_path,
            dedupe_status='duplicate_trashed',
            dedupe_reason='manual_keep_existing',
            dedupe_message='已保留库内文件,当前文件移入 review trash'
        )

    def replace_existing(
        self,
        item: dict,
        *,
        task_id: str,
        output_root: str,
        trash_root: str
    ) -> tuple[dict, dict]:
        """Replace the library duplicate with the item's current file.

        The library file is moved to the review trash and the incoming file
        takes over its path.

        Args:
            item: Task item row; reads ``duplicate_of_path``,
                ``current_file_path``, ``id`` and ``library_relative_path``.
            task_id: Repair task id forwarded to the review-trash move.
            output_root: Library root used to derive ``library_relative_path``.
            trash_root: Root directory of the review trash.

        Returns:
            ``(updated_item, summary)`` where ``summary`` holds
            ``replaced_path`` (trash location of the old file) and
            ``final_path`` (library location of the new file).

        Raises:
            RepairExecutionError: ``duplicate_target_missing`` when the library
                file is not recorded or no longer exists; ``source_missing``
                when the incoming file no longer exists.
        """
        existing_path = item.get('duplicate_of_path')
        if not existing_path:
            raise RepairExecutionError('duplicate_target_missing', '缺少库内重复文件路径')
        existing = Path(existing_path)
        if not existing.exists():
            raise RepairExecutionError('duplicate_target_missing', f'库内文件不存在: {existing}')

        # Validate the incoming file BEFORE touching the library copy.
        # Otherwise a missing source would leave the library file in the
        # trash with nothing to take its place.
        current_path = Path(item['current_file_path'])
        if not current_path.exists():
            raise RepairExecutionError('source_missing', f'源文件不存在: {current_path}')

        replaced_path = self.organize_service.move_to_review_trash(
            trash_root=trash_root,
            task_id=task_id,
            item_id=item['id'],
            source_path=str(existing),
            reason='duplicates'
        )
        existing.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(current_path), str(existing))

        final_path = existing.resolve(strict=False)
        library_root = Path(output_root).expanduser().resolve(strict=False)
        # Compare path components, not string prefixes: '/lib/music-old' is
        # not inside '/lib/music' even though the strings share a prefix.
        if final_path.is_relative_to(library_root):
            library_relative_path = final_path.relative_to(library_root).as_posix()
        else:
            library_relative_path = item.get('library_relative_path')

        final_item = self.task_store.update_task_item(
            item['id'],
            current_file_path=str(final_path),
            filename=existing.name,
            dedupe_status='duplicate_replaced',
            dedupe_reason='replaced_library_duplicate',
            dedupe_message='已替换库内旧文件',
            library_relative_path=library_relative_path,
            library_file_path=str(final_path),
            dedupe_decision_json={
                'comparison_scope': 'library',
                'identity_basis': 'manual_replace',
                'quality_breakdown': {
                    'kept': _build_quality_breakdown(item),
                    'replaced': {'total': None}
                },
                'kept_side': 'batch',
                'trashed_path': replaced_path,
                'replaced_existing_path': str(final_path),
                'compared_candidates': [
                    _serialize_compared_candidate('kept', item),
                    {'side': 'replaced', 'path': str(final_path)}
                ]
            }
        )
        return final_item, {'replaced_path': replaced_path, 'final_path': str(final_path)}

    def keep_both_with_rename(
        self,
        item: dict,
        *,
        output_root: str
    ) -> tuple[dict, dict]:
        """Keep both versions by organizing the item into the library.

        Delegates entirely to ``organize_service.organize_item``.
        """
        return self.organize_service.organize_item(item, output_root=output_root)
def preview(self, payload: dict, config_snapshot: dict) -> dict:
    """Describe what executing ``payload`` would do, without doing it.

    Args:
        payload: Request with ``exception_ids``, ``action`` and optional
            ``params``.
        config_snapshot: Configuration forwarded to the per-item preview.

    Returns:
        A dict with the per-item previews, the flattened planned operations
        and warnings, the highest risk level seen across items, and a
        ``conflict_summary`` (item count and whether exception types differ).

    Raises:
        ValueError: Propagated from batch validation or the per-item preview.
    """
    selected = self._load_exception_items(payload['exception_ids'])
    action = payload['action']
    params = payload.get('params') or {}

    self._validate_batch(selected, action)

    per_item: list[dict] = []
    all_operations: list = []
    all_warnings: list[str] = []
    highest_risk = 'low'

    for entry in selected:
        operations, entry_warnings, entry_risk = self._preview_item(entry, action, params, config_snapshot)
        per_item.append({
            'exception_id': entry['exception_id'],
            'filename': entry['filename'],
            'exception_type': entry['exception_type'],
            'planned_operations': operations,
            'warnings': entry_warnings,
        })
        all_operations += operations
        all_warnings += entry_warnings
        highest_risk = self._merge_risk(highest_risk, entry_risk)

    distinct_types = {previewed['exception_type'] for previewed in per_item}
    summary = {
        'item_count': len(per_item),
        'mixed_types': len(distinct_types) > 1,
    }
    return {
        'action': action,
        'items': per_item,
        'requires_confirmation': True,
        'planned_operations': all_operations,
        'conflict_summary': summary,
        'risk_level': highest_risk,
        'warnings': all_warnings,
    }
def _preview_item(self, item: dict, action: str, params: dict, config_snapshot: dict):
    """Plan the operations a single action would perform on one item.

    Nothing is executed; only operation descriptors are built via ``_op``.

    Args:
        item: Exception item; reads ``current_file_path`` and, depending on
            the action, ``trash_file_path`` / ``duplicate_of_path``.
        action: Repair action name.
        params: Action parameters (``providers``, ``target_relative_path``,
            ``metadata_patch``).
        config_snapshot: Configuration; ``output`` is read for organize-type
            actions.

    Returns:
        ``(operations, warnings, risk)`` where ``risk`` is ``'low'``,
        ``'medium'`` or ``'high'``.

    Raises:
        ValueError: For an unknown action, or when ``save_and_organize`` is
            requested with metadata that cannot be ingested.
    """
    source = item.get('current_file_path')

    def single_step(op_type, target, description, risk, warnings=()):
        # Most actions consist of exactly one operation on the current file.
        return ([self._op(op_type, source, target, description)], list(warnings), risk)

    if action == 'ignore_exception':
        return single_step(
            'status_update', None, '标记为已忽略,不执行物理删除', 'low',
            ['真实执行仅做安全忽略或转入 review trash,不会物理删除源文件。']
        )

    if action == 'delete_file':
        return single_step(
            'trash', None, '永久删除当前文件', 'high',
            ['该动作会真实删除文件,执行后无法恢复。']
        )

    if action == 'edit_metadata':
        return single_step('metadata_write', source, '写入元数据标签', 'low')

    if action == 'retry_match':
        providers = params.get('providers') or []
        if providers:
            description = f'重新执行单文件匹配 ({"/".join(providers)})'
        else:
            description = '重新执行单文件匹配'
        return single_step('status_update', None, description, 'low')

    if action == 'select_match_candidate':
        return single_step('status_update', None, '确认现有匹配候选', 'low')

    if action == 'retry_preprocess':
        return single_step('status_update', None, '重跑预处理与指纹提取', 'low')

    if action == 'move_to_review_trash':
        return single_step('trash', None, '移动到 review trash', 'medium')

    if action == 'keep_existing':
        return single_step('trash', item.get('trash_file_path'), '保留库内文件并移走当前文件', 'medium')

    if action == 'replace_existing':
        steps = [
            self._op('replace', item.get('duplicate_of_path'), None, '将库内旧文件移入 review trash'),
            self._op('move', source, item.get('duplicate_of_path'), '当前文件覆盖进入库内目标'),
        ]
        return (steps, [], 'high')

    if action not in {'retry_organize', 'save_and_organize', 'keep_both_with_rename'}:
        raise ValueError(f'Unsupported action: {action}')

    # Organize-type actions: only retry_organize honours a caller-supplied path.
    override = None
    if action == 'retry_organize':
        override = params.get('target_relative_path')

    if action == 'save_and_organize':
        if params.get('metadata_patch'):
            # Plan against the metadata as it would look after the patch.
            patched_metadata = self.metadata_normalizer.normalize_item(item, params['metadata_patch'])
            item = {**item, 'matched_metadata_json': patched_metadata}
        if not can_ingest_metadata(self.metadata_normalizer.normalize_item(item)):
            raise ValueError('加入音乐库前必须补齐 title、artist、album_artist')

    plan = self.organize_service.plan(item, config_snapshot['output'], override)
    planned_relative_path = plan['planned_relative_path']
    destination = str(Path(config_snapshot['output']) / planned_relative_path)
    steps = [
        self._op('move', source, destination, '移动到目标库路径'),
        self._op('status_update', source, None, f'更新入库路径 {plan["planned_relative_path"]}'),
    ]
    return (steps, [], 'medium')
def _apply_action_to_item(self, repair_task_id: str, exception_id: int, action: str, params: dict, config_snapshot: dict, stats: dict):
    """Execute one repair action against one exception item and record the outcome.

    The method loads the source item, dispatches on ``action`` (each branch
    sets ``final_item``, ``resolution_status`` and ``workflow_state`` and may
    bump counters in ``stats['execute']``), then writes the resolution record
    back to the item and emits a log entry plus a progress broadcast.

    Args:
        repair_task_id: Id of the running repair task; stored on the item and
            used for logging/broadcasting.
        exception_id: Id used to load the exception's source item.
        action: Repair action name.
        params: Action parameters (``metadata_patch``, ``providers``,
            ``candidate_index``, ``target_relative_path``).
        config_snapshot: Configuration; ``trash`` and ``output`` are read by
            the branches that move files.
        stats: Mutable repair statistics; updated in place.

    Raises:
        RepairExecutionError: When the item cannot be loaded, the file to
            delete is missing, or metadata is incomplete for
            ``save_and_organize``. Errors raised by the delegated services
            propagate unchanged.
        ValueError: For an unsupported ``action``.
    """
    item = self.task_store.get_exception_source_item(exception_id)
    if item is None:
        raise RepairExecutionError('item_missing', f'异常项不存在: {exception_id}')
    # Captured before any change so it can seed the "after" snapshot below.
    before_snapshot = self.repair_service.exception_service.get_item(exception_id)
    final_item = item
    # Default result; branches that organize or replace files overwrite it
    # with the summary returned by the delegated service.
    execution_result = {'action': action, 'status': 'completed'}

    if action == 'ignore_exception':
        # Status-only: no file or metadata is touched.
        stats['execute']['ignored_items'] += 1
        resolution_status = 'ignored'
        workflow_state = 'ignored'
    elif action == 'edit_metadata':
        final_item, changed = self.repair_service.metadata_service.apply(item, params.get('metadata_patch') or {})
        if changed:
            stats['execute']['updated_metadata_items'] += 1
        # The exception stays open; the workflow state says whether the
        # metadata is now sufficient to ingest.
        resolution_status = 'open'
        workflow_state = _metadata_workflow_state(final_item, params.get('metadata_patch') or {})
    elif action == 'retry_match':
        final_item = self.repair_service.match_service.retry_match(
            item,
            config_snapshot,
            providers=params.get('providers') or None
        )
        resolution_status = 'open'
        workflow_state = _metadata_workflow_state(final_item)
    elif action == 'select_match_candidate':
        # A missing index becomes -1, which select_candidate rejects.
        final_item = self.repair_service.match_service.select_candidate(item, int(params.get('candidate_index', -1)))
        resolution_status = 'open'
        workflow_state = _candidate_workflow_state(final_item)
    elif action == 'retry_preprocess':
        final_item = self.repair_service.preprocess_service.retry_preprocess(item)
        # Any preprocess status other than 'failed' resolves the exception.
        resolution_status = 'resolved' if final_item.get('preprocess_status') != 'failed' else 'open'
        workflow_state = 'ingested' if resolution_status == 'resolved' else 'open'
    elif action == 'move_to_review_trash':
        trashed_path = self.repair_service.organize_service.move_to_review_trash(
            trash_root=config_snapshot['trash'],
            task_id=repair_task_id,
            item_id=item['id'],
            source_path=item['current_file_path'],
            reason='manual_review'
        )
        final_item = self.task_store.update_task_item(
            item['id'],
            is_active=0,
            current_file_path=trashed_path,
            trash_file_path=trashed_path,
            organize_status='trashed',
            organize_reason='manual_review',
            organize_message='已移入 review trash'
        )
        stats['execute']['moved_items'] += 1
        resolution_status = 'resolved'
        workflow_state = 'ingested'
    elif action == 'keep_existing':
        final_item = self.repair_service.dedupe_service.keep_existing(
            item,
            task_id=repair_task_id,
            trash_root=config_snapshot['trash']
        )
        stats['execute']['moved_items'] += 1
        resolution_status = 'resolved'
        workflow_state = 'ingested'
    elif action == 'replace_existing':
        final_item, execution_result = self.repair_service.dedupe_service.replace_existing(
            item,
            task_id=repair_task_id,
            output_root=config_snapshot['output'],
            trash_root=config_snapshot['trash']
        )
        stats['execute']['moved_items'] += 1
        resolution_status = 'resolved'
        workflow_state = 'ingested'
    elif action == 'keep_both_with_rename':
        final_item, execution_result = self.repair_service.dedupe_service.keep_both_with_rename(
            item,
            output_root=config_snapshot['output']
        )
        stats['execute']['moved_items'] += 1
        resolution_status = 'resolved'
        workflow_state = 'ingested'
    elif action == 'retry_organize':
        final_item, execution_result = self.repair_service.organize_service.organize_item(
            item,
            output_root=config_snapshot['output'],
            override_relative_path=params.get('target_relative_path')
        )
        stats['execute']['moved_items'] += 1
        resolution_status = 'resolved'
        workflow_state = 'ingested'
    elif action == 'save_and_organize':
        # Apply the patch first, then refuse to organize if the resulting
        # metadata is still insufficient.
        patched_item, changed = self.repair_service.metadata_service.apply(item, params.get('metadata_patch') or {})
        if changed:
            stats['execute']['updated_metadata_items'] += 1
        if not can_ingest_metadata(self.repair_service.metadata_normalizer.normalize_item(patched_item)):
            raise RepairExecutionError('metadata_incomplete', '加入音乐库前必须补齐 title、artist、album_artist')
        final_item, execution_result = self.repair_service.organize_service.organize_item(
            patched_item,
            output_root=config_snapshot['output']
        )
        stats['execute']['moved_items'] += 1
        resolution_status = 'resolved'
        workflow_state = 'ingested'
    elif action == 'delete_file':
        file_path = Path(item['current_file_path'])
        if not file_path.exists():
            raise RepairExecutionError('source_missing', f'源文件不存在: {file_path}')
        # Permanent removal; the file is not routed through the review trash.
        file_path.unlink()
        final_item = self.task_store.update_task_item(
            item['id'],
            is_active=0,
            organize_status='deleted',
            organize_reason='manual_delete',
            organize_message='文件已被永久删除'
        )
        resolution_status = 'resolved'
        workflow_state = 'deleted'
    else:
        raise ValueError(f'Unsupported action: {action}')

    # Open exceptions are re-read from the exception service; for every other
    # status the snapshot is rebuilt from the pre-action view plus the
    # path/metadata fields of the final item.
    after_snapshot = self.repair_service.exception_service.get_item(exception_id) if resolution_status == 'open' else {
        **before_snapshot,
        'current_file_path': final_item.get('current_file_path'),
        'trash_file_path': final_item.get('trash_file_path'),
        'library_relative_path': final_item.get('library_relative_path'),
        'library_file_path': final_item.get('library_file_path'),
        'matched_metadata_json': final_item.get('matched_metadata_json'),
        'original_tags_json': final_item.get('original_tags_json')
    }
    # Extend the resolution record already stored on the item rather than
    # replacing it, so earlier keys survive.
    resolution = dict(final_item.get('exception_resolution_json') or {})
    resolution.update(
        {
            'resolved_at': current_timestamp(),
            'workflow_state': workflow_state,
            'metadata_draft': _build_metadata_draft(final_item, params.get('metadata_patch') or {}, workflow_state),
            'after_snapshot': after_snapshot,
            'execution_result': execution_result
        }
    )
    if action == 'keep_both_with_rename':
        resolution['secondary_version_retained'] = True

    self.task_store.update_task_item(
        final_item['id'],
        exception_resolution_status=resolution_status,
        exception_resolution_json=resolution,
        last_repair_task_id=repair_task_id
    )
    self._log(repair_task_id, 'execute', 'success', 'repair.item_completed', f'异常项执行完成: {exception_id}', {'exception_id': exception_id, 'action': action, 'resolution_status': resolution_status})
    self._broadcast(repair_task_id, 'repair.progress', 'execute', {'stats': stats, 'exception_id': exception_id})
def _merge_metadata(item: dict, metadata_patch: dict | None = None) -> dict:
    """Layer the item's original tags, matched metadata and an optional patch."""
    original_tags = item.get('original_tags_json')
    matched = item.get('matched_metadata_json')
    return merge_metadata_layers(original_tags, matched, metadata_patch)


def _can_ingest(metadata: dict) -> bool:
    """Return whether ``metadata`` passes ``can_ingest_metadata``."""
    return can_ingest_metadata(metadata)


def _metadata_workflow_state(item: dict, metadata_patch: dict | None = None) -> str:
    """Return ``'ready_to_ingest'`` when the item's metadata can be ingested, else ``'open'``.

    Non-``None`` values from ``metadata_patch`` override the item's matched
    metadata before the check.
    """
    effective = item.get('matched_metadata_json') or {}
    if metadata_patch:
        overrides = {
            field: value
            for field, value in metadata_patch.items()
            if value is not None
        }
        effective = {**effective, **overrides}
    if _can_ingest(effective):
        return 'ready_to_ingest'
    return 'open'


def _candidate_workflow_state(item: dict) -> str:
    """Return ``'ready_to_ingest'`` or, if metadata is insufficient, ``'candidate_selected'``."""
    if _can_ingest(item.get('matched_metadata_json') or {}):
        return 'ready_to_ingest'
    return 'candidate_selected'


def _build_metadata_draft(item: dict, metadata_patch: dict | None, workflow_state: str) -> dict | None:
    """Build the metadata draft stored with an exception resolution.

    Returns ``None`` when there is no patch and the workflow state is neither
    ``'candidate_selected'`` nor ``'ready_to_ingest'``. Otherwise returns a
    copy of the item's matched metadata with the non-``None`` patch values
    applied on top.
    """
    if workflow_state not in {'candidate_selected', 'ready_to_ingest'} and not metadata_patch:
        return None
    draft = dict(item.get('matched_metadata_json') or {})
    for field, value in (metadata_patch or {}).items():
        if value is not None:
            draft[field] = value
    return draft
def scan(
    self,
    input_dir: str,
    *,
    on_item: Callable[[ScanItem], None] | None = None,
    on_progress: Callable[[dict[str, int]], None] | None = None,
    on_log: Callable[[str, str, dict | None], None] | None = None
) -> dict[str, int]:
    """Walk ``input_dir`` and classify every audio candidate found below it.

    Directories are traversed iteratively with an explicit stack. Symlinks are
    skipped, and entries of a directory are visited in case-insensitive name
    order. Files whose name or extension is on the ignore lists, or whose
    extension is not an allowed audio extension, only bump
    ``ignored_non_audio``.

    Args:
        input_dir: Directory to scan; ``~`` is expanded.
        on_item: Called with each ``ScanItem`` built for an audio candidate.
        on_progress: Called with a copy of the statistics every
            ``SCAN_PROGRESS_BATCH_SIZE`` candidates or
            ``SCAN_PROGRESS_INTERVAL_SECONDS`` seconds, and once at the end.
        on_log: Called as ``on_log(level, message, payload)`` when a directory
            cannot be listed; that directory is then skipped.

    Returns:
        The final statistics dict.

    Raises:
        ScannerError: If ``input_dir`` does not exist or is not a directory.
    """
    input_root = Path(input_dir).expanduser().resolve(strict=False)

    if not input_root.exists():
        raise ScannerError(f'扫描目录不存在: {input_root}')
    if not input_root.is_dir():
        raise ScannerError(f'扫描目录不是有效文件夹: {input_root}')

    stats = create_empty_scan_stats()
    processed_candidates = 0
    last_progress_at = time.monotonic()
    directory_stack = [input_root]

    while directory_stack:
        current_directory = directory_stack.pop()

        try:
            # The context manager closes the directory handle even when
            # iteration fails part-way; sorted() consumes the iterator
            # directly, so no intermediate list is needed.
            with os.scandir(current_directory) as directory_entries:
                entries = sorted(
                    directory_entries,
                    key=lambda entry: entry.name.lower()
                )
        except OSError as error:
            if on_log is not None:
                on_log(
                    'error',
                    f'无法读取目录: {current_directory}',
                    {
                        'path': str(current_directory),
                        'error': str(error)
                    }
                )
            continue

        for entry in entries:
            entry_path = Path(entry.path)

            if entry.is_symlink():
                continue

            if entry.is_dir(follow_symlinks=False):
                directory_stack.append(entry_path)
                continue

            if not entry.is_file(follow_symlinks=False):
                continue

            filename_lower = entry.name.lower()
            extension = entry_path.suffix.lower()

            if filename_lower in IGNORED_FILENAMES or extension in IGNORED_EXTENSIONS:
                stats['ignored_non_audio'] += 1
                continue

            if extension not in ALLOWED_AUDIO_EXTENSIONS:
                stats['ignored_non_audio'] += 1
                continue

            stats['total_found'] += 1
            processed_candidates += 1

            item = self._build_item(input_root, entry_path, entry, extension)
            if item.scan_status == 'queued':
                stats['queued'] += 1
            elif item.scan_status == 'skipped_locked':
                stats['skipped_locked'] += 1
            else:
                stats['skipped_invalid'] += 1

            if on_item is not None:
                on_item(item)

            # Throttle progress callbacks by candidate count or elapsed time,
            # whichever threshold is reached first.
            now = time.monotonic()
            if (
                processed_candidates % SCAN_PROGRESS_BATCH_SIZE == 0
                or now - last_progress_at >= SCAN_PROGRESS_INTERVAL_SECONDS
            ):
                if on_progress is not None:
                    on_progress(stats.copy())
                last_progress_at = now

    # Always publish the final totals.
    if on_progress is not None:
        on_progress(stats.copy())

    return stats
scan_status='invalid', + scan_reason='stat_failed', + scan_message='无法读取文件状态信息' + ) + + if time.time() - entry_stat.st_mtime < 60: + return ScanItem( + original_path=str(absolute_path), + relative_path=relative_path, + filename=entry_path.name, + extension=extension, + size_bytes=size_bytes, + modified_at=modified_at, + local_cover=None, + local_lyric=None, + scan_status='skipped_locked', + scan_reason='recent_mtime', + scan_message='文件最近 60 秒内仍在变更,已跳过' + ) + + if not os.access(absolute_path, os.R_OK) or not os.access(absolute_path, os.W_OK): + return ScanItem( + original_path=str(absolute_path), + relative_path=relative_path, + filename=entry_path.name, + extension=extension, + size_bytes=size_bytes, + modified_at=modified_at, + local_cover=None, + local_lyric=None, + scan_status='invalid', + scan_reason='permission_denied', + scan_message='当前进程缺少读写权限' + ) + + try: + with absolute_path.open('rb') as file_handle: + file_handle.read(1) + except FileNotFoundError: + return ScanItem( + original_path=str(absolute_path), + relative_path=relative_path, + filename=entry_path.name, + extension=extension, + size_bytes=size_bytes, + modified_at=modified_at, + local_cover=None, + local_lyric=None, + scan_status='invalid', + scan_reason='path_disappeared', + scan_message='文件在读取前已消失' + ) + except OSError: + return ScanItem( + original_path=str(absolute_path), + relative_path=relative_path, + filename=entry_path.name, + extension=extension, + size_bytes=size_bytes, + modified_at=modified_at, + local_cover=None, + local_lyric=None, + scan_status='invalid', + scan_reason='unreadable', + scan_message='文件无法读取' + ) + + assets = probe_local_assets(absolute_path) + return ScanItem( + original_path=str(absolute_path), + relative_path=relative_path, + filename=entry_path.name, + extension=extension, + size_bytes=size_bytes, + modified_at=modified_at, + local_cover=assets['local_cover'], + local_lyric=assets['local_lyric'], + scan_status='queued', + scan_reason=None, + scan_message=None + 
def probe_local_assets(audio_path: str | Path) -> dict[str, str | None]:
    """Look for a cover image and a lyric file next to ``audio_path``.

    Cover candidates are tried in order: every ``_COVER_BASENAMES`` entry
    combined with every ``COVER_IMAGE_EXTENSIONS`` entry, then the audio
    file's own stem with each cover extension. The lyric candidate is the
    audio stem plus ``_LYRIC_EXTENSION``. Names are compared
    case-insensitively.

    Returns:
        ``{'local_cover': <path or None>, 'local_lyric': <path or None>}``.
        Both values are ``None`` when the parent directory is missing or not
        a directory.
    """
    audio_file = Path(audio_path)
    parent = audio_file.parent

    if not (parent.exists() and parent.is_dir()):
        return {'local_cover': None, 'local_lyric': None}

    siblings = _index_regular_files(parent)
    stem = audio_file.stem.lower()

    cover_names = []
    for basename in _COVER_BASENAMES:
        for extension in COVER_IMAGE_EXTENSIONS:
            cover_names.append(f'{basename}{extension}')
    for extension in COVER_IMAGE_EXTENSIONS:
        cover_names.append(f'{stem}{extension}')

    cover = _first_existing_path(siblings, cover_names)
    lyric = _first_existing_path(siblings, [f'{stem}{_LYRIC_EXTENSION}'])

    return {
        'local_cover': None if not cover else str(cover),
        'local_lyric': None if not lyric else str(lyric),
    }


def _index_regular_files(directory: Path) -> dict[str, Path]:
    """Map lower-cased file names in ``directory`` to their resolved paths.

    Symlinks and non-files are left out. When several names differ only by
    case, the first one in case-insensitive sort order is kept.
    """
    children = list(directory.iterdir())
    children.sort(key=lambda child: child.name.lower())

    index: dict[str, Path] = {}
    for child in children:
        if child.is_symlink():
            continue
        if not child.is_file():
            continue
        lowered = child.name.lower()
        if lowered not in index:
            index[lowered] = child.resolve(strict=False)
    return index


def _first_existing_path(
    files_by_lower_name: dict[str, Path],
    candidates: list[str]
) -> Path | None:
    """Return the indexed path of the first candidate name present, else ``None``."""
    lookups = (files_by_lower_name.get(name.lower()) for name in candidates)
    return next((found for found in lookups if found is not None), None)


def _format_timestamp(timestamp: float) -> str:
    """Format a POSIX timestamp as a UTC ISO-8601 string with a ``Z`` suffix.

    Sub-second precision is dropped.
    """
    moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    whole_seconds = moment.replace(microsecond=0)
    return whole_seconds.isoformat().replace('+00:00', 'Z')
# Schedule section of the configuration. Unknown extra keys are accepted
# (extra='allow').
class ScheduleConfig(BaseModel):
    model_config = ConfigDict(extra='allow')

    enabled: bool = True
    type: str = 'daily'
    dayOfWeek: str = '1'
    time: str = '02:00'
    cron: str = '0 2 * * *'


# Feature switches for the processing strategy.
class AdvancedStrategyConfig(BaseModel):
    metadataFallback: bool = True
    downloadAssets: bool = True
    replaceLowQualityDuplicates: bool = False


# Notification channel settings; every field defaults to an empty string.
class NotificationConfig(BaseModel):
    dingtalkWebhook: str = ''
    dingtalkSecret: str = ''
    telegramBotToken: str = ''
    telegramChatId: str = ''
    emailSmtp: str = ''
    emailUser: str = ''
    emailPass: str = ''
    emailTo: str = ''


# Endpoints and credentials of the metadata providers. URL fields carry
# defaults; key/token fields default to empty.
class MetadataConfig(BaseModel):
    acoustidUrl: str = 'https://api.acoustid.org/v2'
    acoustidClientKey: str = ''
    musicbrainz: str = 'https://musicbrainz.org/ws/2/'
    netease: str = 'http://localhost:3000'
    qq: str = 'http://localhost:3300'
    spotifyUrl: str = 'https://api.spotify.com/v1'
    spotifyClientId: str = ''
    spotifySecret: str = ''
    discogsUrl: str = 'https://api.discogs.com'
    discogsToken: str = ''
    lastfmUrl: str = 'https://ws.audioscrobbler.com/2.0/'
    lastfmKey: str = ''
    geniusUrl: str = 'https://api.genius.com'
    geniusToken: str = ''


# Complete configuration document: three path strings plus the nested
# sections above, each created from its defaults when omitted.
class ConfigPayload(BaseModel):
    input: str = Field(default='')
    output: str = Field(default='')
    trash: str = Field(default='')
    schedule: ScheduleConfig = Field(default_factory=ScheduleConfig)
    advancedStrategy: AdvancedStrategyConfig = Field(default_factory=AdvancedStrategyConfig)
    notifications: NotificationConfig = Field(default_factory=NotificationConfig)
    metadata: MetadataConfig = Field(default_factory=MetadataConfig)

    @model_validator(mode='after')
    def validate_config(self):
        # Runs after field validation. Rejects non-string path values and a
        # cron expression that is empty or whitespace-only.
        required_paths = {
            'input': self.input,
            'output': self.output,
            'trash': self.trash
        }

        # NOTE(review): only the type is checked here, so empty path strings
        # pass despite the "required" name — confirm that is intended.
        for field_name, value in required_paths.items():
            if not isinstance(value, str):
                raise ValueError(f'{field_name} must be a string')

        if not isinstance(self.schedule.cron, str) or not self.schedule.cron.strip():
            raise ValueError('schedule.cron must not be empty')

        return self


# Status of one metadata provider; ``status`` is restricted to the literal
# values listed.
class MetadataStatusPayload(BaseModel):
    status: Literal['checking', 'online', 'warning', 'offline', 'none', 'idle']
    latencyMs: int | None = None
    message: str


# Provider statuses keyed by a string.
class MetadataStatusResponse(BaseModel):
    metadataStatus: dict[str, MetadataStatusPayload]


# Saved configuration together with the provider statuses.
class ConfigSaveResponse(BaseModel):
    config: ConfigPayload
    metadataStatus: dict[str, MetadataStatusPayload]


# Full summary of a task ('ingest' or 'repair').
class TaskSummaryPayload(BaseModel):
    task_id: str
    task_type: Literal['ingest', 'repair']
    trigger_source: str
    source_task_id: str | None = None
    status: Literal['pending', 'running', 'completed', 'failed']
    current_stage: str
    # Stage name -> stage status.
    stage_states: dict[str, Literal['pending', 'running', 'completed', 'skipped', 'failed']]
    # Two-level mapping of integer counters.
    stats: dict[str, dict[str, int]]
    repair_plan_json: dict[str, Any] | None = None
    error_message: str | None = None
    started_at: str
    completed_at: str | None = None
    updated_at: str


# Reduced task view: id, status, stage information and start time only.
class TaskRunResponse(BaseModel):
    task_id: str
    status: Literal['pending', 'running', 'completed', 'failed']
    current_stage: str
    stage_states: dict[str, Literal['pending', 'running', 'completed', 'skipped', 'failed']]
    started_at: str


# Wrapper whose ``task`` may be None.
class TaskCurrentResponse(BaseModel):
    task: TaskSummaryPayload | None = None


# Wrapper whose ``task`` is required.
class TaskDetailResponse(BaseModel):
    task: TaskSummaryPayload


# One row of the task history list; ``status`` only allows finished states.
class TaskHistoryListItemPayload(BaseModel):
    task_id: str
    started_at: str
    status: Literal['completed', 'failed']
    total_items: int
    success_items: int
    exception_items: int
    report_status: Literal['success', 'warning']


# Paginated task history.
class TaskHistoryListResponse(BaseModel):
    items: list[TaskHistoryListItemPayload]
    page: int
    page_size: int
    total: int
| None = None + local_cover: str | None = None + local_lyric: str | None = None + scan_status: Literal['queued', 'skipped_locked', 'invalid'] + scan_reason: Literal[ + 'recent_mtime', + 'permission_denied', + 'stat_failed', + 'path_disappeared', + 'unreadable' + ] | None = None + scan_message: str | None = None + preprocess_status: str + preprocess_reason: str | None = None + preprocess_message: str | None = None + audio_props_json: dict[str, Any] | None = None + original_tags_json: dict[str, Any] | None = None + preprocess_artifacts_json: dict[str, Any] | None = None + acoustic_fingerprint: str | None = None + fingerprint_duration_seconds: float | None = None + match_status: Literal[ + 'pending', + 'running', + 'matched', + 'matched_fallback', + 'low_score', + 'not_found', + 'failed' + ] + match_reason: str | None = None + match_message: str | None = None + match_source: str | None = None + match_confidence: float | None = None + match_is_authoritative: bool + matched_metadata_json: dict[str, Any] | None = None + match_candidates_json: list[dict[str, Any]] | None = None + match_enrichment_json: dict[str, Any] | None = None + dedupe_status: Literal[ + 'pending', + 'running', + 'unique', + 'duplicate_trashed', + 'duplicate_replaced', + 'failed' + ] + dedupe_reason: str | None = None + dedupe_message: str | None = None + dedupe_group_key: str | None = None + duplicate_of_path: str | None = None + duplicate_of_item_id: int | None = None + dedupe_decision_json: dict[str, Any] | None = None + organize_status: Literal['pending', 'running', 'organized', 'trashed', 'failed'] + organize_reason: str | None = None + organize_message: str | None = None + library_relative_path: str | None = None + library_file_path: str | None = None + trash_file_path: str | None = None + organize_decision_json: dict[str, Any] | None = None + created_at: str + updated_at: str + + +class TaskItemsPageResponse(BaseModel): + items: list[TaskItemPayload] + page: int + page_size: int + total: int + 
+ +class TaskLogPayload(BaseModel): + id: int + task_id: str + stage: str + level: Literal['info', 'warning', 'error', 'success'] + event_type: str + message: str + payload: dict | None = None + created_at: str + + +class TaskLogsPageResponse(BaseModel): + logs: list[TaskLogPayload] + page: int + page_size: int + total: int + + +class ExceptionSummaryPayload(BaseModel): + total: int + counts_by_type: dict[str, int] + scanned_at: str + + +class ExceptionListItemPayload(BaseModel): + exception_id: int + task_id: str + task_started_at: str + exception_type: Literal[ + 'missing_tags', + 'duplicates', + 'match_failed', + 'low_score', + 'convert_failed', + 'organize_failed' + ] + exception_stage: Literal['preprocess', 'match', 'dedupe', 'organize'] + exception_reason_code: str | None = None + exception_message: str | None = None + captured_at: str + filename: str + relative_path: str + original_path: str + current_file_path: str + trash_file_path: str | None = None + audio_props_json: dict[str, Any] | None = None + original_tags_json: dict[str, Any] | None = None + matched_metadata_json: dict[str, Any] | None = None + duplicate_of_path: str | None = None + dedupe_decision_json: dict[str, Any] | None = None + library_relative_path: str | None = None + library_file_path: str | None = None + match_source: str | None = None + match_confidence: float | None = None + preview_available: bool + available_actions: list[str] + exception_resolution_status: Literal['open', 'planned', 'resolved', 'ignored'] + exception_resolution_json: dict[str, Any] | None = None + workflow_state: Literal[ + 'open', + 'candidate_selected', + 'ready_to_ingest', + 'ingested', + 'ignored', + 'deleted' + ] + raw_metadata: dict[str, Any] + metadata_draft: dict[str, Any] + effective_metadata: dict[str, Any] + normalization_strategy: str | None = None + album_artist_reason: str | None = None + compilation: int = 0 + can_ingest: bool + pending_ingest: bool + display_title: str + display_reason: str + 
type_label: str + + +class ExceptionDetailPayload(ExceptionListItemPayload): + preprocess_artifacts_json: dict[str, Any] | None = None + match_candidates_json: list[dict[str, Any]] | None = None + match_enrichment_json: dict[str, Any] | None = None + organize_decision_json: dict[str, Any] | None = None + + +class ExceptionListResponse(BaseModel): + items: list[ExceptionListItemPayload] + page: int + page_size: int + total: int + + +class MetadataPatchPayload(BaseModel): + title: str | None = None + artist: str | None = None + album: str | None = None + album_artist: str | None = None + track_number: int | None = None + disc_number: int | None = None + year: int | None = None + lyrics: str | None = None + + +class RepairPreviewRequest(BaseModel): + exception_ids: list[int] + action: str + params: dict[str, Any] = Field(default_factory=dict) + + +class RepairExecuteRequest(RepairPreviewRequest): + pass + + +class PlannedOperationPayload(BaseModel): + type: Literal['move', 'replace', 'rename', 'metadata_write', 'trash', 'status_update'] + source_path: str | None = None + target_path: str | None = None + description: str + + +class RepairPreviewItemPayload(BaseModel): + exception_id: int + filename: str + exception_type: str + planned_operations: list[PlannedOperationPayload] + warnings: list[str] = Field(default_factory=list) + + +class RepairPreviewResponse(BaseModel): + action: str + items: list[RepairPreviewItemPayload] + requires_confirmation: bool + planned_operations: list[PlannedOperationPayload] + conflict_summary: dict[str, Any] + risk_level: Literal['low', 'medium', 'high'] + warnings: list[str] + + +class RepairTaskRunResponse(BaseModel): + repair_task_id: str + status: Literal['pending', 'running', 'completed', 'failed'] + current_stage: str + stage_states: dict[str, Literal['pending', 'running', 'completed', 'skipped', 'failed']] + started_at: str + + +class RepairTaskCurrentResponse(BaseModel): + task: TaskSummaryPayload | None = None + + +class 
LibraryIngestProvenancePayload(BaseModel): + task_id: str + organized_at: str + match_source: str | None = None + match_confidence: float | None = None + dedupe_status: str | None = None + + +class LibrarySummaryPayload(BaseModel): + total_tracks: int + total_albums: int + total_artists: int + suspected_duplicates: int + scanned_at: str + + +class LibraryTrackPayload(BaseModel): + track_id: str + library_relative_path: str + library_file_path: str + filename: str + title: str | None = None + artist: str | None = None + album: str | None = None + album_artist: str | None = None + track_number: int | None = None + disc_number: int | None = None + year: int | None = None + duration_seconds: float | int | None = None + format: str | None = None + codec: str | None = None + bitrate: int | None = None + sample_rate: int | None = None + bit_depth: int | None = None + channels: int | None = None + size_bytes: int | None = None + modified_at: str | None = None + ingest_provenance: LibraryIngestProvenancePayload | None = None + + +class LibraryTracksPageResponse(BaseModel): + items: list[LibraryTrackPayload] + page: int + page_size: int + total: int + + +class LibraryMoveToExceptionResponse(BaseModel): + exception_id: int + library_relative_path: str + trash_file_path: str + message: str diff --git a/backend/app/storage.py b/backend/app/storage.py new file mode 100644 index 0000000..ad16876 --- /dev/null +++ b/backend/app/storage.py @@ -0,0 +1,61 @@ +import json +import sqlite3 +from pathlib import Path + +from .defaults import create_default_config, merge_config + + +class ConfigStore: + def __init__(self, db_path: Path): + self.db_path = Path(db_path) + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self._initialize() + + def _connect(self): + connection = sqlite3.connect(self.db_path) + connection.row_factory = sqlite3.Row + return connection + + def _initialize(self): + with self._connect() as connection: + connection.execute( + ''' + CREATE TABLE IF NOT EXISTS 
app_config ( + id INTEGER PRIMARY KEY CHECK (id = 1), + config_json TEXT NOT NULL, + updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP + ) + ''' + ) + connection.commit() + + def get_config(self): + with self._connect() as connection: + row = connection.execute( + 'SELECT config_json FROM app_config WHERE id = 1' + ).fetchone() + + if not row: + config = create_default_config() + self.save_config(config) + return config + + return merge_config(json.loads(row['config_json'])) + + def save_config(self, config): + normalized_config = merge_config(config) + + with self._connect() as connection: + connection.execute( + ''' + INSERT INTO app_config (id, config_json, updated_at) + VALUES (1, ?, CURRENT_TIMESTAMP) + ON CONFLICT(id) DO UPDATE SET + config_json = excluded.config_json, + updated_at = CURRENT_TIMESTAMP + ''', + (json.dumps(normalized_config),) + ) + connection.commit() + + return normalized_config diff --git a/backend/app/task_constants.py b/backend/app/task_constants.py new file mode 100644 index 0000000..594a244 --- /dev/null +++ b/backend/app/task_constants.py @@ -0,0 +1,312 @@ +from datetime import datetime, timezone + + +TASK_STATUS_PENDING = 'pending' +TASK_STATUS_RUNNING = 'running' +TASK_STATUS_COMPLETED = 'completed' +TASK_STATUS_FAILED = 'failed' +TASK_TYPE_INGEST = 'ingest' +TASK_TYPE_REPAIR = 'repair' + +STAGE_STATUS_PENDING = 'pending' +STAGE_STATUS_RUNNING = 'running' +STAGE_STATUS_COMPLETED = 'completed' +STAGE_STATUS_SKIPPED = 'skipped' +STAGE_STATUS_FAILED = 'failed' + +ACTIVE_TASK_STATUSES = (TASK_STATUS_PENDING, TASK_STATUS_RUNNING) +SCAN_PROGRESS_LOG_LIMIT = 50 +SCAN_PROGRESS_BATCH_SIZE = 20 +SCAN_PROGRESS_INTERVAL_SECONDS = 1.0 +PREPROCESS_PROGRESS_BATCH_SIZE = 20 +PREPROCESS_PROGRESS_INTERVAL_SECONDS = 1.0 +MATCH_PROGRESS_BATCH_SIZE = 20 +MATCH_PROGRESS_INTERVAL_SECONDS = 1.0 +DEDUPE_PROGRESS_BATCH_SIZE = 20 +DEDUPE_PROGRESS_INTERVAL_SECONDS = 1.0 +ORGANIZE_PROGRESS_BATCH_SIZE = 20 +ORGANIZE_PROGRESS_INTERVAL_SECONDS = 1.0 
+TASK_WORKSPACE_ROOT = '/tmp/musicworkshop/tasks' + +STAGE_IDS = ( + 'scan', + 'preprocess', + 'match', + 'dedupe', + 'organize', + 'complete' +) + +REPAIR_STAGE_IDS = ( + 'prepare', + 'execute', + 'complete' +) + + +def current_timestamp() -> str: + return ( + datetime.now(timezone.utc) + .replace(microsecond=0) + .isoformat() + .replace('+00:00', 'Z') + ) + + +def create_pending_stage_states() -> dict[str, str]: + return {stage_id: STAGE_STATUS_PENDING for stage_id in STAGE_IDS} + + +def create_pending_repair_stage_states() -> dict[str, str]: + return {stage_id: STAGE_STATUS_PENDING for stage_id in REPAIR_STAGE_IDS} + + +def create_scan_failed_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_FAILED, + 'preprocess': STAGE_STATUS_PENDING, + 'match': STAGE_STATUS_PENDING, + 'dedupe': STAGE_STATUS_PENDING, + 'organize': STAGE_STATUS_PENDING, + 'complete': STAGE_STATUS_FAILED + } + + +def create_scan_completed_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_PENDING, + 'match': STAGE_STATUS_PENDING, + 'dedupe': STAGE_STATUS_PENDING, + 'organize': STAGE_STATUS_PENDING, + 'complete': STAGE_STATUS_PENDING + } + + +def create_preprocess_running_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_RUNNING, + 'match': STAGE_STATUS_PENDING, + 'dedupe': STAGE_STATUS_PENDING, + 'organize': STAGE_STATUS_PENDING, + 'complete': STAGE_STATUS_PENDING + } + + +def create_preprocess_completed_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_COMPLETED, + 'match': STAGE_STATUS_PENDING, + 'dedupe': STAGE_STATUS_PENDING, + 'organize': STAGE_STATUS_PENDING, + 'complete': STAGE_STATUS_PENDING + } + + +def create_preprocess_failed_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_FAILED, + 'match': STAGE_STATUS_PENDING, + 'dedupe': 
STAGE_STATUS_PENDING, + 'organize': STAGE_STATUS_PENDING, + 'complete': STAGE_STATUS_FAILED + } + + +def create_match_running_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_COMPLETED, + 'match': STAGE_STATUS_RUNNING, + 'dedupe': STAGE_STATUS_PENDING, + 'organize': STAGE_STATUS_PENDING, + 'complete': STAGE_STATUS_PENDING + } + + +def create_match_completed_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_COMPLETED, + 'match': STAGE_STATUS_COMPLETED, + 'dedupe': STAGE_STATUS_PENDING, + 'organize': STAGE_STATUS_PENDING, + 'complete': STAGE_STATUS_PENDING + } + + +def create_match_failed_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_COMPLETED, + 'match': STAGE_STATUS_FAILED, + 'dedupe': STAGE_STATUS_PENDING, + 'organize': STAGE_STATUS_PENDING, + 'complete': STAGE_STATUS_FAILED + } + + +def create_dedupe_running_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_COMPLETED, + 'match': STAGE_STATUS_COMPLETED, + 'dedupe': STAGE_STATUS_RUNNING, + 'organize': STAGE_STATUS_PENDING, + 'complete': STAGE_STATUS_PENDING + } + + +def create_dedupe_completed_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_COMPLETED, + 'match': STAGE_STATUS_COMPLETED, + 'dedupe': STAGE_STATUS_COMPLETED, + 'organize': STAGE_STATUS_PENDING, + 'complete': STAGE_STATUS_PENDING + } + + +def create_dedupe_failed_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_COMPLETED, + 'match': STAGE_STATUS_COMPLETED, + 'dedupe': STAGE_STATUS_FAILED, + 'organize': STAGE_STATUS_PENDING, + 'complete': STAGE_STATUS_FAILED + } + + +def create_organize_running_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': 
STAGE_STATUS_COMPLETED, + 'match': STAGE_STATUS_COMPLETED, + 'dedupe': STAGE_STATUS_COMPLETED, + 'organize': STAGE_STATUS_RUNNING, + 'complete': STAGE_STATUS_PENDING + } + + +def create_organize_completed_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_COMPLETED, + 'match': STAGE_STATUS_COMPLETED, + 'dedupe': STAGE_STATUS_COMPLETED, + 'organize': STAGE_STATUS_COMPLETED, + 'complete': STAGE_STATUS_PENDING + } + + +def create_organize_failed_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_COMPLETED, + 'match': STAGE_STATUS_COMPLETED, + 'dedupe': STAGE_STATUS_COMPLETED, + 'organize': STAGE_STATUS_FAILED, + 'complete': STAGE_STATUS_FAILED + } + + +def create_task_completed_stage_states() -> dict[str, str]: + return { + 'scan': STAGE_STATUS_COMPLETED, + 'preprocess': STAGE_STATUS_COMPLETED, + 'match': STAGE_STATUS_COMPLETED, + 'dedupe': STAGE_STATUS_COMPLETED, + 'organize': STAGE_STATUS_COMPLETED, + 'complete': STAGE_STATUS_COMPLETED + } + + +def create_empty_scan_stats() -> dict[str, int]: + return { + 'total_found': 0, + 'queued': 0, + 'skipped_locked': 0, + 'skipped_invalid': 0, + 'ignored_non_audio': 0 + } + + +def create_empty_preprocess_stats() -> dict[str, int]: + return { + 'input_items': 0, + 'output_items': 0, + 'split_parents': 0, + 'generated_children': 0, + 'converted_items': 0, + 'metadata_snapshots': 0, + 'fingerprints_ok': 0, + 'fingerprints_failed': 0, + 'failed_items': 0, + 'warning_items': 0 + } + + +def create_empty_match_stats() -> dict[str, int]: + return { + 'input_items': 0, + 'matched_authoritative': 0, + 'matched_fallback': 0, + 'low_score': 0, + 'not_found': 0, + 'provider_warnings': 0, + 'failed_items': 0 + } + + +def create_empty_dedupe_stats() -> dict[str, int]: + return { + 'input_items': 0, + 'library_candidates': 0, + 'batch_duplicates': 0, + 'library_duplicates': 0, + 'replaced_library_items': 0, + 'kept_items': 0, + 
'failed_items': 0 + } + + +def create_empty_organize_stats() -> dict[str, int]: + return { + 'input_items': 0, + 'moved_items': 0, + 'renamed_items': 0, + 'collision_resolved': 0, + 'trashed_items': 0, + 'failed_items': 0 + } + + +def create_empty_task_stats() -> dict[str, dict[str, int]]: + return { + 'scan': create_empty_scan_stats(), + 'preprocess': create_empty_preprocess_stats(), + 'match': create_empty_match_stats(), + 'dedupe': create_empty_dedupe_stats(), + 'organize': create_empty_organize_stats() + } + + +def create_empty_repair_stats() -> dict[str, dict[str, int]]: + return { + 'prepare': { + 'previewed_items': 0, + 'rejected_items': 0 + }, + 'execute': { + 'succeeded_items': 0, + 'failed_items': 0, + 'moved_items': 0, + 'updated_metadata_items': 0, + 'ignored_items': 0 + } + } diff --git a/backend/app/task_runner.py b/backend/app/task_runner.py new file mode 100644 index 0000000..3ac0991 --- /dev/null +++ b/backend/app/task_runner.py @@ -0,0 +1,1412 @@ +import shutil +import time +from datetime import datetime, timezone +from pathlib import Path + +from .defaults import merge_config +from .library_postprocess import ( + DedupeRunner, + OrganizeRunner, + _build_prefixed_name, + _build_unique_destination +) +from .matcher import MatchProviderError, Matcher +from .preprocessor import ( + FORCED_FLAC_EXTENSIONS, + PreprocessItemError, + Preprocessor, + build_preprocess_paths, + build_split_child_relative_path, + merge_tag_snapshots +) +from .scanner import ScanItem, Scanner +from .task_constants import ( + MATCH_PROGRESS_BATCH_SIZE, + MATCH_PROGRESS_INTERVAL_SECONDS, + PREPROCESS_PROGRESS_BATCH_SIZE, + PREPROCESS_PROGRESS_INTERVAL_SECONDS, + STAGE_STATUS_RUNNING, + TASK_STATUS_COMPLETED, + TASK_STATUS_FAILED, + TASK_STATUS_RUNNING, + current_timestamp, + create_dedupe_completed_stage_states, + create_dedupe_failed_stage_states, + create_dedupe_running_stage_states, + create_empty_task_stats, + create_match_completed_stage_states, + 
create_match_failed_stage_states, + create_match_running_stage_states, + create_organize_completed_stage_states, + create_organize_failed_stage_states, + create_organize_running_stage_states, + create_pending_stage_states, + create_preprocess_completed_stage_states, + create_preprocess_failed_stage_states, + create_preprocess_running_stage_states, + create_task_completed_stage_states, + create_scan_completed_stage_states, + create_scan_failed_stage_states +) +from .task_store import TaskStore +from .task_stream import TaskStreamManager + + +class TaskRunner: + def __init__( + self, + task_store: TaskStore, + scanner: Scanner, + preprocessor: Preprocessor, + task_stream: TaskStreamManager, + matcher: Matcher | None = None, + dedupe_runner: DedupeRunner | None = None, + organize_runner: OrganizeRunner | None = None + ): + self.task_store = task_store + self.scanner = scanner + self.preprocessor = preprocessor + self.task_stream = task_stream + self.matcher = matcher or Matcher() + self.dedupe_runner = dedupe_runner or DedupeRunner(task_store, preprocessor, task_stream) + self.organize_runner = organize_runner or OrganizeRunner(task_store, task_stream) + + def start_task(self, task_id: str, config_snapshot: dict): + normalized_config = merge_config(config_snapshot) + current_stats = create_empty_task_stats() + failure_stage = 'scan' + + running_stage_states = create_pending_stage_states() + running_stage_states['scan'] = STAGE_STATUS_RUNNING + + self.task_store.update_task( + task_id, + status=TASK_STATUS_RUNNING, + current_stage='scan', + stage_states=running_stage_states, + stats=current_stats + ) + self.task_stream.broadcast_event( + task_id, + 'task.started', + 'system', + {'status': TASK_STATUS_RUNNING, 'current_stage': 'scan'} + ) + self._append_log( + task_id, + stage='system', + level='info', + event_type='task.started', + message='任务已启动', + payload={'current_stage': 'scan'} + ) + self.task_stream.broadcast_event( + task_id, + 'stage.started', + 'scan', + 
{'stage': 'scan'} + ) + self._append_log( + task_id, + stage='scan', + level='info', + event_type='stage.started', + message=f'开始扫描目录: {normalized_config["input"]}', + payload={'path': normalized_config['input']} + ) + + try: + self._quarantine_exception_items(task_id, normalized_config, scope='history') + scan_stats = self.scanner.scan( + normalized_config['input'], + on_item=lambda item: self._handle_scan_item(task_id, item), + on_progress=lambda next_stats: self._handle_scan_progress( + task_id, + next_stats, + current_stats + ), + on_log=lambda level, message, payload: self._handle_scanner_log( + task_id, + level, + message, + payload + ) + ) + + current_stats['scan'] = scan_stats.copy() + self.task_store.update_task( + task_id, + status=TASK_STATUS_RUNNING, + current_stage='scan', + stage_states=create_scan_completed_stage_states(), + stats=current_stats + ) + self.task_stream.broadcast_event( + task_id, + 'stage.completed', + 'scan', + {'stats': current_stats} + ) + self._append_log( + task_id, + stage='scan', + level='success', + event_type='stage.completed', + message='扫描阶段完成', + payload={'stats': current_stats} + ) + + failure_stage = 'preprocess' + self._run_preprocess_stage(task_id, current_stats) + self.task_store.update_task( + task_id, + status=TASK_STATUS_RUNNING, + current_stage='preprocess', + stage_states=create_preprocess_completed_stage_states(), + stats=current_stats + ) + self.task_stream.broadcast_event( + task_id, + 'stage.completed', + 'preprocess', + {'stats': current_stats} + ) + self._append_log( + task_id, + stage='preprocess', + level='success', + event_type='stage.completed', + message='音频预处理阶段完成', + payload={'stats': current_stats} + ) + + failure_stage = 'match' + self._run_match_stage(task_id, current_stats, normalized_config) + + self.task_store.update_task( + task_id, + status=TASK_STATUS_RUNNING, + current_stage='match', + stage_states=create_match_completed_stage_states(), + stats=current_stats + ) + 
self.task_stream.broadcast_event( + task_id, + 'stage.completed', + 'match', + {'stats': current_stats} + ) + self._append_log( + task_id, + stage='match', + level='success', + event_type='stage.completed', + message='音乐匹配阶段完成', + payload={'stats': current_stats} + ) + + failure_stage = 'dedupe' + self._run_dedupe_stage(task_id, current_stats, normalized_config) + self.task_store.update_task( + task_id, + status=TASK_STATUS_RUNNING, + current_stage='dedupe', + stage_states=create_dedupe_completed_stage_states(), + stats=current_stats, + ) + self.task_stream.broadcast_event( + task_id, + 'stage.completed', + 'dedupe', + {'stats': current_stats} + ) + self._append_log( + task_id, + stage='dedupe', + level='success', + event_type='stage.completed', + message='重复检测阶段完成', + payload={'stats': current_stats} + ) + + failure_stage = 'organize' + self._run_organize_stage(task_id, current_stats, normalized_config) + self._quarantine_exception_items(task_id, normalized_config, scope='current') + self.task_store.update_task( + task_id, + status=TASK_STATUS_RUNNING, + current_stage='organize', + stage_states=create_organize_completed_stage_states(), + stats=current_stats + ) + self.task_stream.broadcast_event( + task_id, + 'stage.completed', + 'organize', + {'stats': current_stats} + ) + self._append_log( + task_id, + stage='organize', + level='success', + event_type='stage.completed', + message='整理入库阶段完成', + payload={'stats': current_stats} + ) + + completed_at = current_timestamp() + completed_stage_states = create_task_completed_stage_states() + self.task_store.update_task( + task_id, + status=TASK_STATUS_COMPLETED, + current_stage='complete', + stage_states=completed_stage_states, + stats=current_stats, + completed_at=completed_at + ) + self.task_stream.broadcast_event( + task_id, + 'task.completed', + 'complete', + {'status': TASK_STATUS_COMPLETED, 'stats': current_stats} + ) + self._append_log( + task_id, + stage='complete', + level='success', + 
event_type='task.completed', + message='任务已完成,五个阶段均已执行结束', + payload={'stats': current_stats} + ) + except Exception as error: + if failure_stage == 'scan': + failed_stage_states = create_scan_failed_stage_states() + failed_stage = 'scan' + elif failure_stage == 'preprocess': + failed_stage_states = create_preprocess_failed_stage_states() + failed_stage = 'preprocess' + elif failure_stage == 'match': + failed_stage_states = create_match_failed_stage_states() + failed_stage = 'match' + elif failure_stage == 'dedupe': + failed_stage_states = create_dedupe_failed_stage_states() + failed_stage = 'dedupe' + else: + failed_stage_states = create_organize_failed_stage_states() + failed_stage = 'organize' + completed_at = current_timestamp() + self.task_store.update_task( + task_id, + status=TASK_STATUS_FAILED, + current_stage=failed_stage, + stage_states=failed_stage_states, + stats=current_stats, + error_message=str(error), + completed_at=completed_at + ) + self.task_stream.broadcast_event( + task_id, + 'task.failed', + failed_stage, + {'status': TASK_STATUS_FAILED, 'error_message': str(error), 'stats': current_stats} + ) + self._append_log( + task_id, + stage=failed_stage, + level='error', + event_type='task.failed', + message=f'任务失败: {error}', + payload={'error_message': str(error), 'stats': current_stats} + ) + + def _run_preprocess_stage(self, task_id: str, current_stats: dict): + preprocess_stats = current_stats['preprocess'].copy() + initial_candidates = self.task_store.list_preprocess_candidate_items(task_id) + preprocess_stats['input_items'] = len(initial_candidates) + current_stats['preprocess'] = preprocess_stats.copy() + + self.task_store.update_task( + task_id, + status=TASK_STATUS_RUNNING, + current_stage='preprocess', + stage_states=create_preprocess_running_stage_states(), + stats=current_stats + ) + self.task_stream.broadcast_event( + task_id, + 'stage.started', + 'preprocess', + {'stage': 'preprocess'} + ) + self._append_log( + task_id, + 
stage='preprocess', + level='info', + event_type='stage.started', + message='开始执行音频预处理阶段', + payload={'input_items': preprocess_stats['input_items']} + ) + + if not initial_candidates: + self._persist_preprocess_progress(task_id, current_stats, preprocess_stats) + return + + dependencies = self.preprocessor.check_dependencies() + workspace = self.preprocessor.create_workspace(task_id) + self.task_stream.broadcast_event( + task_id, + 'preprocess.dependencies_checked', + 'preprocess', + { + 'dependencies': dependencies, + 'workspace': {name: str(path) for name, path in workspace.items()} + } + ) + self._append_log( + task_id, + stage='preprocess', + level='info', + event_type='preprocess.dependencies_checked', + message='预处理依赖检查通过', + payload={ + 'dependencies': dependencies, + 'workspace': {name: str(path) for name, path in workspace.items()} + } + ) + + processed_count = 0 + last_progress_at = time.monotonic() + + while True: + candidates = self.task_store.list_preprocess_candidate_items(task_id) + if not candidates: + break + + item = self.task_store.update_task_item( + candidates[0]['id'], + preprocess_status='running', + preprocess_reason=None, + preprocess_message=None + ) + + try: + self._process_preprocess_item(task_id, item, preprocess_stats) + except PreprocessItemError as error: + preprocess_stats['failed_items'] += 1 + failed_item = self.task_store.update_task_item( + item['id'], + preprocess_status='failed', + preprocess_reason=error.reason, + preprocess_message=error.message + ) + self.task_stream.broadcast_event( + task_id, + 'preprocess.item_failed', + 'preprocess', + {'item': failed_item} + ) + self._append_log( + task_id, + stage='preprocess', + level='error', + event_type='preprocess.item_failed', + message=f'预处理失败: {item["relative_path"]}', + payload={'item': failed_item} + ) + except Exception as error: + preprocess_stats['failed_items'] += 1 + failed_item = self.task_store.update_task_item( + item['id'], + preprocess_status='failed', + 
preprocess_reason='unexpected_error', + preprocess_message=str(error) + ) + self.task_stream.broadcast_event( + task_id, + 'preprocess.item_failed', + 'preprocess', + {'item': failed_item} + ) + self._append_log( + task_id, + stage='preprocess', + level='error', + event_type='preprocess.item_failed', + message=f'预处理异常: {item["relative_path"]}', + payload={'item': failed_item} + ) + + processed_count += 1 + now = time.monotonic() + if ( + processed_count % PREPROCESS_PROGRESS_BATCH_SIZE == 0 + or now - last_progress_at >= PREPROCESS_PROGRESS_INTERVAL_SECONDS + ): + self._persist_preprocess_progress(task_id, current_stats, preprocess_stats) + last_progress_at = now + + self._persist_preprocess_progress(task_id, current_stats, preprocess_stats) + + def _run_match_stage(self, task_id: str, current_stats: dict, config_snapshot: dict): + match_stats = current_stats['match'].copy() + initial_candidates = self.task_store.list_match_candidate_items(task_id) + album_groups = _build_album_groups(initial_candidates) + match_stats['input_items'] = len(initial_candidates) + current_stats['match'] = match_stats.copy() + + self.task_store.update_task( + task_id, + status=TASK_STATUS_RUNNING, + current_stage='match', + stage_states=create_match_running_stage_states(), + stats=current_stats + ) + self.task_stream.broadcast_event( + task_id, + 'stage.started', + 'match', + {'stage': 'match'} + ) + self._append_log( + task_id, + stage='match', + level='info', + event_type='stage.started', + message='开始执行音乐匹配阶段', + payload={'input_items': match_stats['input_items']} + ) + + if not initial_candidates: + self._persist_match_progress(task_id, current_stats, match_stats) + return + + processed_count = 0 + last_progress_at = time.monotonic() + + while True: + candidates = self.task_store.list_match_candidate_items(task_id) + if not candidates: + break + + item = self.task_store.update_task_item( + candidates[0]['id'], + match_status='running', + match_reason=None, + match_message=None, + 
match_source=None, + match_confidence=None, + match_is_authoritative=0, + matched_metadata_json=None, + match_candidates_json=None, + match_enrichment_json=None + ) + item_group = album_groups.get(_album_group_key(item), [item]) + self.task_stream.broadcast_event( + task_id, + 'match.lookup_started', + 'match', + {'item': item} + ) + self._append_log( + task_id, + stage='match', + level='info', + event_type='match.lookup_started', + message=f'开始匹配: {item["relative_path"]}', + payload={'item': item} + ) + + try: + match_result = self.matcher.match_item(item, item_group, config_snapshot) + item = self.task_store.update_task_item( + item['id'], + match_status=match_result['status'], + match_reason=match_result['reason'], + match_message=match_result['message'], + match_source=match_result['source'], + match_confidence=match_result['confidence'], + match_is_authoritative=1 if match_result['is_authoritative'] else 0, + matched_metadata_json=match_result['matched_metadata_json'], + match_candidates_json=match_result['match_candidates_json'], + match_enrichment_json=match_result['match_enrichment_json'] + ) + self._handle_provider_warnings( + task_id, + item, + match_result.get('provider_warnings') or [], + match_stats + ) + self.task_stream.broadcast_event( + task_id, + 'match.candidates_found', + 'match', + { + 'item': item, + 'candidates': match_result['match_candidates_json'] + } + ) + self._append_log( + task_id, + stage='match', + level='info', + event_type='match.candidates_found', + message=( + f'匹配候选已生成: {item["relative_path"]} ' + f'({len(match_result["match_candidates_json"])} 个)' + ), + payload={ + 'item': item, + 'candidates': match_result['match_candidates_json'] + } + ) + self._handle_match_result(task_id, item, match_result, match_stats) + except MatchProviderError as error: + match_stats['failed_items'] += 1 + failed_item = self.task_store.update_task_item( + item['id'], + match_status='failed', + match_reason='provider_error', + match_message=str(error), 
+ match_source=getattr(error, 'provider', None), + match_confidence=None, + match_is_authoritative=0, + matched_metadata_json=None, + match_candidates_json=None, + match_enrichment_json=None + ) + self.task_stream.broadcast_event( + task_id, + 'match.item_failed', + 'match', + {'item': failed_item} + ) + self._append_log( + task_id, + stage='match', + level='error', + event_type='match.item_failed', + message=f'匹配失败: {item["relative_path"]}', + payload={'item': failed_item} + ) + except Exception as error: + match_stats['failed_items'] += 1 + failed_item = self.task_store.update_task_item( + item['id'], + match_status='failed', + match_reason='unexpected_error', + match_message=str(error), + match_source=None, + match_confidence=None, + match_is_authoritative=0, + matched_metadata_json=None, + match_candidates_json=None, + match_enrichment_json=None + ) + self.task_stream.broadcast_event( + task_id, + 'match.item_failed', + 'match', + {'item': failed_item} + ) + self._append_log( + task_id, + stage='match', + level='error', + event_type='match.item_failed', + message=f'匹配异常: {item["relative_path"]}', + payload={'item': failed_item} + ) + + processed_count += 1 + now = time.monotonic() + if ( + processed_count % MATCH_PROGRESS_BATCH_SIZE == 0 + or now - last_progress_at >= MATCH_PROGRESS_INTERVAL_SECONDS + ): + self._persist_match_progress(task_id, current_stats, match_stats) + last_progress_at = now + + self._persist_match_progress(task_id, current_stats, match_stats) + + def _run_dedupe_stage(self, task_id: str, current_stats: dict, config_snapshot: dict): + dedupe_stats = current_stats['dedupe'].copy() + initial_candidates = self.task_store.list_dedupe_candidate_items(task_id) + dedupe_stats['input_items'] = len(initial_candidates) + current_stats['dedupe'] = dedupe_stats.copy() + + self.task_store.update_task( + task_id, + status=TASK_STATUS_RUNNING, + current_stage='dedupe', + stage_states=create_dedupe_running_stage_states(), + stats=current_stats + ) + 
def _run_organize_stage(self, task_id: str, current_stats: dict, config_snapshot: dict):
    """Switch the task to the organize stage and run the organize runner.

    The number of organize candidates reported by the task store is written
    to ``current_stats['organize']['input_items']`` (the caller's dict is
    updated in place), the running state is persisted, a ``stage.started``
    event and log entry are emitted, and the work itself is delegated to
    ``self.organize_runner.run``.
    """
    stage = 'organize'
    started_event = 'stage.started'

    stage_stats = current_stats[stage].copy()
    candidates = self.task_store.list_organize_candidate_items(task_id)
    input_count = len(candidates)
    stage_stats['input_items'] = input_count
    current_stats[stage] = stage_stats

    self.task_store.update_task(
        task_id,
        status=TASK_STATUS_RUNNING,
        current_stage=stage,
        stage_states=create_organize_running_stage_states(),
        stats=current_stats
    )
    self.task_stream.broadcast_event(task_id, started_event, stage, {'stage': stage})
    self._append_log(
        task_id,
        stage=stage,
        level='info',
        event_type=started_event,
        message='开始执行整理入库阶段',
        payload={'input_items': input_count}
    )

    self.organize_runner.run(task_id, current_stats, config_snapshot)
def _quarantine_exception_item(
    self,
    item: dict,
    exception_type: str,
    input_root: Path,
    trash_root: Path
) -> dict | None:
    """Move the files of one exception item into the trash tree.

    Files are moved to ``trash_root / exception_type / item['task_id']``.
    The item's current file is always moved when it exists; the original
    file is moved only when it lies under ``input_root``.

    Returns ``None`` when neither existing file is under ``input_root`` or
    when nothing was moved and no error occurred. Otherwise returns a dict
    with ``current_file_path``, ``trash_file_path``, ``moved_paths`` and
    ``errors``.

    Raises ``OSError`` when every attempted move failed.
    """
    current_path = _existing_file_path(item.get('current_file_path'))
    original_path = _existing_file_path(item.get('original_path'))

    touches_input = any(
        candidate is not None and _is_relative_to(candidate, input_root)
        for candidate in (current_path, original_path)
    )
    if not touches_input:
        return None

    target_dir = trash_root / exception_type / item['task_id']
    moved: dict[str, str] = {}
    failures: list[dict[str, str]] = []
    handled: set[Path] = set()

    for role, source in (('current', current_path), ('original', original_path)):
        if source is None:
            continue
        resolved = source.resolve(strict=False)
        # current and original may be the same file; move it only once.
        if resolved in handled:
            continue
        if role == 'current' or _is_relative_to(source, input_root):
            target = _build_unique_destination(
                target_dir,
                _build_prefixed_name(item['id'], source.name)
            )
            target.parent.mkdir(parents=True, exist_ok=True)
            try:
                shutil.move(str(source), str(target))
                moved[role] = str(target.resolve(strict=False))
            except OSError as error:
                failures.append(
                    {
                        'role': role,
                        'source_path': str(source),
                        'destination_path': str(target),
                        'error_message': str(error)
                    }
                )
                continue
        handled.add(resolved)

    trashed_current = moved.get('current')
    trashed_original = moved.get('original')

    new_current = trashed_current
    if new_current is None and current_path is not None and current_path.exists():
        # The current file stayed where it was; keep pointing at it.
        new_current = str(current_path.resolve(strict=False))
    new_current = new_current or trashed_original or item.get('current_file_path')

    trash_file_path = trashed_current or trashed_original
    if trash_file_path is None:
        if failures:
            raise OSError(f'failed to move exception files: {failures}')
        return None

    return {
        'current_file_path': new_current,
        'trash_file_path': trash_file_path,
        'moved_paths': moved,
        'errors': failures
    }
self._append_log( + task_id, + stage='system', + level='info', + event_type='exception.quarantine_skipped', + message=message, + payload={ + 'item_id': item['id'], + 'source_task_id': item['task_id'], + 'exception_type': exception_type, + 'reason': reason + } + ) + + def _handle_provider_warnings( + self, + task_id: str, + item: dict, + provider_warnings: list[dict], + match_stats: dict[str, int] + ): + if not provider_warnings: + return + + match_stats['provider_warnings'] += len(provider_warnings) + for provider_warning in provider_warnings: + provider_name = provider_warning.get('provider') or 'unknown' + warning_message = provider_warning.get('message') or f'{provider_name} 请求失败' + payload = { + 'item': item, + 'provider_warning': provider_warning + } + self.task_stream.broadcast_event( + task_id, + 'match.provider_skipped', + 'match', + payload + ) + self._append_log( + task_id, + stage='match', + level='warning', + event_type='match.provider_skipped', + message=f'已跳过 {provider_name}: {warning_message}', + payload=payload + ) + + def _handle_match_result( + self, + task_id: str, + item: dict, + match_result: dict, + match_stats: dict[str, int] + ): + if match_result['status'] == 'matched': + match_stats['matched_authoritative'] += 1 + event_type = 'match.item_matched' + level = 'success' + message = ( + f'权威匹配成功: {item["relative_path"]} ' + f'({match_result["source"]}, {match_result["confidence"]:.1f})' + ) + elif match_result['status'] == 'matched_fallback': + match_stats['matched_fallback'] += 1 + event_type = 'match.item_matched' + level = 'warning' + message = ( + f'Fallback 匹配成功: {item["relative_path"]} ' + f'({match_result["source"]}, {match_result["confidence"]:.1f})' + ) + elif match_result['status'] == 'low_score': + match_stats['low_score'] += 1 + event_type = 'match.item_low_score' + level = 'warning' + message = f'候选分数不足: {item["relative_path"]}' + else: + match_stats['not_found'] += 1 + event_type = 'match.item_not_found' + level = 'warning' + 
def _process_preprocess_item(
    self,
    task_id: str,
    item: dict,
    preprocess_stats: dict[str, int]
):
    """Run the preprocess steps for a single task item.

    If the preprocessor finds a matching CUE sheet for the item's file the
    item is handed to ``_split_cue_item`` and nothing else happens here.
    Otherwise the file is optionally converted to FLAC, probed, its tags and
    embedded cover are read, a fingerprint is calculated, and the item row is
    updated with status ``warning`` or ``completed``.

    ``preprocess_stats`` is mutated in place. Failures while reading tags,
    extracting the cover or fingerprinting are recorded as warnings on the
    item; exceptions from conversion or probing are not caught here.
    """
    current_file_path = item['current_file_path'] or item['original_path']
    item_paths = build_preprocess_paths(task_id, item['id'])

    # A file with a matching CUE sheet is split into child items instead of
    # being processed as a single track.
    if cue_path := self.preprocessor.find_matching_cue(current_file_path):
        self._split_cue_item(
            task_id,
            item,
            cue_path,
            current_file_path,
            item_paths,
            preprocess_stats
        )
        return

    # Work on copies so the caller's item dict is not mutated.
    artifacts = dict(item.get('preprocess_artifacts_json') or {})
    original_tags = dict(item.get('original_tags_json') or {})
    local_cover = item.get('local_cover')
    converted_path = current_file_path

    if Path(current_file_path).suffix.lower() in FORCED_FLAC_EXTENSIONS:
        converted_path = self.preprocessor.convert_to_flac(
            current_file_path,
            item_paths['converted']
        )
        artifacts['converted_path'] = converted_path
        preprocess_stats['converted_items'] += 1
        converted_file = Path(converted_path)
        # Persist the conversion immediately, before the later steps run.
        converted_item = self.task_store.update_task_item(
            item['id'],
            current_file_path=converted_path,
            filename=converted_file.name,
            extension=converted_file.suffix.lower(),
            preprocess_artifacts_json=artifacts
        )
        self.task_stream.broadcast_event(
            task_id,
            'preprocess.item_converted',
            'preprocess',
            {'item': converted_item}
        )
        self._append_log(
            task_id,
            stage='preprocess',
            level='info',
            event_type='preprocess.item_converted',
            message=f'已转码为 FLAC: {item["relative_path"]}',
            payload={'item': converted_item}
        )

    audio_props = self.preprocessor.probe_audio(converted_path)
    # Human-readable warning texts and their machine-readable reason codes.
    warnings: list[str] = []
    warning_reasons: list[str] = []

    try:
        extracted_tags = self.preprocessor.read_tags(converted_path)
        original_tags = merge_tag_snapshots(extracted_tags, original_tags)
    except Exception as error:
        # Best effort: a tag-read failure downgrades the item to a warning.
        warning_reasons.append('metadata_failed')
        warnings.append(f'读取元数据失败: {error}')

    # Only look for an embedded cover when the item has no local cover.
    if not local_cover:
        try:
            embedded_cover = self.preprocessor.extract_embedded_cover(
                converted_path,
                item_paths['cover']
            )
            if embedded_cover:
                artifacts['embedded_cover'] = embedded_cover
                local_cover = embedded_cover
        except Exception as error:
            warning_reasons.append('metadata_failed')
            warnings.append(f'提取内嵌封面失败: {error}')

    fingerprint = None
    fingerprint_duration = None
    try:
        fingerprint_payload = self.preprocessor.calculate_fingerprint(converted_path)
        fingerprint = fingerprint_payload['fingerprint']
        fingerprint_duration = fingerprint_payload['duration_seconds']
        preprocess_stats['fingerprints_ok'] += 1
    except PreprocessItemError as error:
        # Only PreprocessItemError is treated as a warning here; any other
        # exception propagates to the caller.
        warning_reasons.append(error.reason)
        warnings.append(error.message)
        preprocess_stats['fingerprints_failed'] += 1

    preprocess_stats['metadata_snapshots'] += 1
    preprocess_stats['output_items'] += 1

    update_fields = {
        'current_file_path': converted_path,
        'local_cover': local_cover,
        'audio_props_json': audio_props,
        'original_tags_json': original_tags,
        # An empty artifacts dict is stored as None.
        'preprocess_artifacts_json': artifacts or None,
        'acoustic_fingerprint': fingerprint,
        'fingerprint_duration_seconds': fingerprint_duration
    }

    if warnings:
        preprocess_stats['warning_items'] += 1
        # Duplicate reasons/messages are collapsed before joining.
        update_fields.update(
            preprocess_status='warning',
            preprocess_reason=','.join(_unique_strings(warning_reasons)),
            preprocess_message=';'.join(_unique_strings(warnings))
        )
        final_item = self.task_store.update_task_item(item['id'], **update_fields)
        self.task_stream.broadcast_event(
            task_id,
            'preprocess.item_warning',
            'preprocess',
            {'item': final_item}
        )
        self._append_log(
            task_id,
            stage='preprocess',
            level='warning',
            event_type='preprocess.item_warning',
            message=f'预处理完成但存在警告: {item["relative_path"]}',
            payload={'item': final_item}
        )
        return

    update_fields.update(
        preprocess_status='completed',
        preprocess_reason=None,
        preprocess_message=None
    )
    final_item = self.task_store.update_task_item(item['id'], **update_fields)
    self.task_stream.broadcast_event(
        task_id,
        'preprocess.item_completed',
        'preprocess',
        {'item': final_item}
    )
    self._append_log(
        task_id,
        stage='preprocess',
        level='success',
        event_type='preprocess.item_completed',
        message=f'预处理完成: {item["relative_path"]}',
        payload={'item': final_item}
    )
def _handle_scan_item(self, task_id: str, item: ScanItem):
    """Persist one scanned file and report it on the stream and in the log.

    The item is inserted with ``preprocess_status`` set to ``'pending'`` when
    its ``scan_status`` is ``'queued'`` and ``'skipped'`` otherwise. The
    event type, log level and message depend on ``scan_status``: queued,
    skipped because the file is still being written, or anything else
    (reported as an error).
    """
    is_queued = item.scan_status == 'queued'
    persisted_item = self.task_store.insert_task_item(
        task_id,
        **item.to_dict(),
        preprocess_status='pending' if is_queued else 'skipped'
    )

    # scan_status -> (event type, log level, message prefix)
    known_outcomes = {
        'queued': ('scan.file_queued', 'info', '文件已加入扫描结果'),
        'skipped_locked': ('scan.file_skipped', 'warning', '跳过最近仍在写入的文件')
    }
    fallback_outcome = ('scan.file_skipped', 'error', '文件无法处理')
    event_type, level, prefix = known_outcomes.get(item.scan_status, fallback_outcome)
    message = f'{prefix}: {item.relative_path}'

    event_payload = {'item': persisted_item}
    self.task_stream.broadcast_event(task_id, event_type, 'scan', event_payload)
    self._append_log(
        task_id,
        stage='scan',
        level=level,
        event_type=event_type,
        message=message,
        payload={'item': persisted_item}
    )
_format_file_timestamp(file_stat.st_mtime) + + +def _format_file_timestamp(value: float) -> str: + return ( + datetime.fromtimestamp(value, tz=timezone.utc) + .replace(microsecond=0) + .isoformat() + .replace('+00:00', 'Z') + ) + + +def _unique_strings(values: list[str]) -> list[str]: + return list(dict.fromkeys(values)) + + +def _existing_file_path(value: str | None) -> Path | None: + if not value: + return None + path = Path(value).expanduser() + return path if path.exists() and path.is_file() else None + + +def _is_relative_to(path: Path | None, root: Path) -> bool: + if path is None: + return False + try: + path.resolve(strict=False).relative_to(root) + return True + except ValueError: + return False + + +def _exception_type_for_item(item: dict) -> str | None: + if item.get('organize_status') in {'trashed', 'failed'}: + return 'organize_failed' + if item.get('dedupe_status') in {'duplicate_trashed', 'failed'}: + return 'duplicates' + if item.get('match_status') == 'low_score': + return 'low_score' + if item.get('match_status') in {'failed', 'not_found'}: + return 'match_failed' + if ( + item.get('preprocess_status') == 'failed' + and item.get('preprocess_reason') == 'convert_failed' + ): + return 'convert_failed' + preprocess_reason = item.get('preprocess_reason') or '' + if item.get('preprocess_status') == 'warning' and 'metadata_failed' in preprocess_reason: + return 'missing_tags' + return None + + +def _album_group_key(item: dict) -> str: + current_file_path = item.get('current_file_path') or item.get('original_path') or '' + if current_file_path: + return str(Path(current_file_path).parent) + relative_path = item.get('relative_path') or '' + return str(Path(relative_path).parent) + + +def _build_album_groups(items: list[dict]) -> dict[str, list[dict]]: + album_groups: dict[str, list[dict]] = {} + for item in items: + album_groups.setdefault(_album_group_key(item), []).append(item) + return album_groups diff --git a/backend/app/task_store.py 
b/backend/app/task_store.py new file mode 100644 index 0000000..c8a5d3e --- /dev/null +++ b/backend/app/task_store.py @@ -0,0 +1,1209 @@ +import json +import sqlite3 +from pathlib import Path +from uuid import uuid4 + +from .task_constants import ( + ACTIVE_TASK_STATUSES, + SCAN_PROGRESS_LOG_LIMIT, + STAGE_STATUS_FAILED, + TASK_STATUS_COMPLETED, + TASK_STATUS_FAILED, + TASK_STATUS_PENDING, + TASK_TYPE_INGEST, + current_timestamp, + create_empty_dedupe_stats, + create_empty_match_stats, + create_empty_organize_stats, + create_empty_preprocess_stats, + create_empty_repair_stats, + create_empty_scan_stats, + create_empty_task_stats, + create_pending_repair_stage_states, + create_pending_stage_states +) + + +class TaskConflictError(Exception): + def __init__(self, active_task_id: str): + super().__init__('Task already running') + self.active_task_id = active_task_id + + +class TaskNotFoundError(Exception): + pass + + +class TaskStore: + def __init__(self, db_path: Path): + self.db_path = Path(db_path) + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self._initialize() + + def _connect(self): + connection = sqlite3.connect(self.db_path, check_same_thread=False) + connection.row_factory = sqlite3.Row + connection.execute('PRAGMA foreign_keys = ON') + return connection + + def _initialize(self): + with self._connect() as connection: + connection.executescript( + ''' + CREATE TABLE IF NOT EXISTS task_runs ( + id TEXT PRIMARY KEY, + task_type TEXT NOT NULL DEFAULT 'ingest', + trigger_source TEXT NOT NULL, + source_task_id TEXT, + status TEXT NOT NULL, + current_stage TEXT NOT NULL, + stage_states_json TEXT NOT NULL, + config_snapshot_json TEXT NOT NULL, + stats_json TEXT NOT NULL, + repair_plan_json TEXT, + error_message TEXT, + started_at TEXT NOT NULL, + completed_at TEXT, + updated_at TEXT NOT NULL + ); + + CREATE INDEX IF NOT EXISTS idx_task_runs_status_updated + ON task_runs (status, updated_at DESC); + + CREATE TABLE IF NOT EXISTS task_items ( + id INTEGER 
PRIMARY KEY AUTOINCREMENT, + task_id TEXT NOT NULL, + original_path TEXT NOT NULL, + relative_path TEXT NOT NULL, + filename TEXT NOT NULL, + extension TEXT NOT NULL, + size_bytes INTEGER, + modified_at TEXT, + local_cover TEXT, + local_lyric TEXT, + scan_status TEXT NOT NULL, + scan_reason TEXT, + scan_message TEXT, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL, + FOREIGN KEY (task_id) REFERENCES task_runs(id) ON DELETE CASCADE + ); + + CREATE TABLE IF NOT EXISTS task_logs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + task_id TEXT NOT NULL, + stage TEXT NOT NULL, + level TEXT NOT NULL, + event_type TEXT NOT NULL, + message TEXT NOT NULL, + payload_json TEXT, + created_at TEXT NOT NULL, + FOREIGN KEY (task_id) REFERENCES task_runs(id) ON DELETE CASCADE + ); + ''' + ) + self._ensure_task_items_columns(connection) + self._ensure_indexes(connection) + connection.commit() + + def _ensure_task_items_columns(self, connection: sqlite3.Connection): + existing_columns = { + row['name'] + for row in connection.execute("PRAGMA table_info('task_items')").fetchall() + } + + column_specs = { + 'parent_item_id': 'INTEGER', + 'is_active': 'INTEGER NOT NULL DEFAULT 1', + 'current_file_path': 'TEXT', + 'preprocess_status': "TEXT NOT NULL DEFAULT 'pending'", + 'preprocess_reason': 'TEXT', + 'preprocess_message': 'TEXT', + 'audio_props_json': 'TEXT', + 'original_tags_json': 'TEXT', + 'preprocess_artifacts_json': 'TEXT', + 'acoustic_fingerprint': 'TEXT', + 'fingerprint_duration_seconds': 'REAL', + 'match_status': "TEXT NOT NULL DEFAULT 'pending'", + 'match_reason': 'TEXT', + 'match_message': 'TEXT', + 'match_source': 'TEXT', + 'match_confidence': 'REAL', + 'match_is_authoritative': 'INTEGER NOT NULL DEFAULT 0', + 'matched_metadata_json': 'TEXT', + 'match_candidates_json': 'TEXT', + 'match_enrichment_json': 'TEXT', + 'dedupe_status': "TEXT NOT NULL DEFAULT 'pending'", + 'dedupe_reason': 'TEXT', + 'dedupe_message': 'TEXT', + 'dedupe_group_key': 'TEXT', + 'duplicate_of_path': 
'TEXT', + 'duplicate_of_item_id': 'INTEGER', + 'dedupe_decision_json': 'TEXT', + 'organize_status': "TEXT NOT NULL DEFAULT 'pending'", + 'organize_reason': 'TEXT', + 'organize_message': 'TEXT', + 'library_relative_path': 'TEXT', + 'library_file_path': 'TEXT', + 'trash_file_path': 'TEXT', + 'organize_decision_json': 'TEXT', + 'exception_resolution_status': "TEXT NOT NULL DEFAULT 'open'", + 'exception_resolution_json': 'TEXT', + 'last_repair_task_id': 'TEXT' + } + + task_run_columns = { + row['name'] + for row in connection.execute("PRAGMA table_info('task_runs')").fetchall() + } + + task_run_specs = { + 'task_type': "TEXT NOT NULL DEFAULT 'ingest'", + 'source_task_id': 'TEXT', + 'repair_plan_json': 'TEXT' + } + + for column_name, column_sql in task_run_specs.items(): + if column_name not in task_run_columns: + connection.execute( + f'ALTER TABLE task_runs ADD COLUMN {column_name} {column_sql}' + ) + + for column_name, column_sql in column_specs.items(): + if column_name not in existing_columns: + connection.execute( + f'ALTER TABLE task_items ADD COLUMN {column_name} {column_sql}' + ) + + connection.execute( + ''' + UPDATE task_items + SET current_file_path = original_path + WHERE current_file_path IS NULL + ''' + ) + + def _ensure_indexes(self, connection: sqlite3.Connection): + connection.executescript( + ''' + CREATE INDEX IF NOT EXISTS idx_task_items_task_created + ON task_items (task_id, created_at ASC); + + CREATE INDEX IF NOT EXISTS idx_task_items_task_status + ON task_items (task_id, scan_status); + + CREATE INDEX IF NOT EXISTS idx_task_items_task_preprocess + ON task_items (task_id, preprocess_status, is_active); + + CREATE INDEX IF NOT EXISTS idx_task_items_task_match + ON task_items (task_id, match_status, is_active); + + CREATE INDEX IF NOT EXISTS idx_task_items_task_dedupe + ON task_items (task_id, dedupe_status, is_active); + + CREATE INDEX IF NOT EXISTS idx_task_items_task_organize + ON task_items (task_id, organize_status, is_active); + + CREATE 
def fail_stale_active_tasks(self) -> list[str]:
    """Mark every task that is still in an active status as failed.

    For each row of ``task_runs`` whose status is one of
    ``ACTIVE_TASK_STATUSES``: stage states that are ``'pending'`` or
    ``'running'`` become ``STAGE_STATUS_FAILED``, the task is set to
    ``TASK_STATUS_FAILED`` with the error message
    ``'Service restarted unexpectedly'``, and a ``task.failed`` log row is
    inserted. All changes happen inside one ``BEGIN IMMEDIATE`` transaction.

    Returns the ids of the tasks that were failed.
    """
    from contextlib import closing

    failed_task_ids: list[str] = []
    timestamp = current_timestamp()
    # Build one placeholder per status so the query does not silently depend
    # on ACTIVE_TASK_STATUSES holding exactly two entries.
    active_statuses = tuple(ACTIVE_TASK_STATUSES)
    status_placeholders = ', '.join('?' for _ in active_statuses)

    # The sqlite3 connection context manager only commits or rolls back; it
    # never closes. closing() releases the connection, while the inner
    # ``connection`` context keeps the rollback-on-error behaviour.
    with closing(self._connect()) as connection, connection:
        connection.isolation_level = None
        connection.execute('BEGIN IMMEDIATE')
        stale_rows = connection.execute(
            f'''
            SELECT id, stage_states_json
            FROM task_runs
            WHERE status IN ({status_placeholders})
            ''',
            active_statuses
        ).fetchall()

        for row in stale_rows:
            stage_states = json.loads(row['stage_states_json'])
            repaired_stage_states = {
                stage_id: (
                    STAGE_STATUS_FAILED
                    if stage_status in ('pending', 'running')
                    else stage_status
                )
                for stage_id, stage_status in stage_states.items()
            }
            connection.execute(
                '''
                UPDATE task_runs
                SET status = ?, stage_states_json = ?, error_message = ?, completed_at = ?, updated_at = ?
                WHERE id = ?
                ''',
                (
                    TASK_STATUS_FAILED,
                    json.dumps(repaired_stage_states),
                    'Service restarted unexpectedly',
                    timestamp,
                    timestamp,
                    row['id']
                )
            )
            connection.execute(
                '''
                INSERT INTO task_logs (task_id, stage, level, event_type, message, payload_json, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                ''',
                (
                    row['id'],
                    'system',
                    'error',
                    'task.failed',
                    'Service restarted unexpectedly',
                    None,
                    timestamp
                )
            )
            failed_task_ids.append(row['id'])

        connection.commit()

    return failed_task_ids
def get_active_task(self, task_type: str = TASK_TYPE_INGEST) -> dict | None:
    """Return the most recently started active task of ``task_type``.

    A task counts as active when its status is one of
    ``ACTIVE_TASK_STATUSES``. Returns the parsed task, or ``None`` when no
    such row exists.
    """
    from contextlib import closing

    # Build one placeholder per status so the query does not silently depend
    # on ACTIVE_TASK_STATUSES holding exactly two entries.
    active_statuses = tuple(ACTIVE_TASK_STATUSES)
    status_placeholders = ', '.join('?' for _ in active_statuses)

    # ``with connection:`` would only commit/roll back and leave the
    # connection open; closing() releases it once the read is done.
    with closing(self._connect()) as connection:
        row = connection.execute(
            f'''
            SELECT *
            FROM task_runs
            WHERE status IN ({status_placeholders})
            AND task_type = ?
            ORDER BY started_at DESC
            LIMIT 1
            ''',
            (*active_statuses, task_type)
        ).fetchone()

    return self._parse_task(row) if row is not None else None
def get_task(self, task_id: str) -> dict:
    """Return the parsed ``task_runs`` row with id ``task_id``.

    Raises:
        TaskNotFoundError: no row with that id exists.
    """
    from contextlib import closing

    # ``with connection:`` would only commit/roll back and leave the
    # connection open; closing() releases it once the read is done.
    with closing(self._connect()) as connection:
        row = connection.execute(
            'SELECT * FROM task_runs WHERE id = ?',
            (task_id,)
        ).fetchone()

    if row is None:
        raise TaskNotFoundError(task_id)

    return self._parse_task(row)
message, payload_json, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?) + ''', + ( + task_id, + stage, + level, + event_type, + message, + json.dumps(payload) if payload is not None else None, + timestamp + ) + ) + connection.commit() + row = connection.execute( + 'SELECT * FROM task_logs WHERE id = ?', + (cursor.lastrowid,) + ).fetchone() + + return self._parse_log(row) + + def insert_task_item( + self, + task_id: str, + *, + original_path: str, + relative_path: str, + filename: str, + extension: str, + size_bytes: int | None, + modified_at: str | None, + local_cover: str | None, + local_lyric: str | None, + scan_status: str, + scan_reason: str | None, + scan_message: str | None, + parent_item_id: int | None = None, + is_active: int = 1, + current_file_path: str | None = None, + preprocess_status: str = 'pending', + preprocess_reason: str | None = None, + preprocess_message: str | None = None, + audio_props_json: dict | None = None, + original_tags_json: dict | None = None, + preprocess_artifacts_json: dict | None = None, + acoustic_fingerprint: str | None = None, + fingerprint_duration_seconds: float | None = None, + match_status: str = 'pending', + match_reason: str | None = None, + match_message: str | None = None, + match_source: str | None = None, + match_confidence: float | None = None, + match_is_authoritative: int = 0, + matched_metadata_json: dict | None = None, + match_candidates_json: list[dict] | None = None, + match_enrichment_json: dict | None = None, + dedupe_status: str = 'pending', + dedupe_reason: str | None = None, + dedupe_message: str | None = None, + dedupe_group_key: str | None = None, + duplicate_of_path: str | None = None, + duplicate_of_item_id: int | None = None, + dedupe_decision_json: dict | None = None, + organize_status: str = 'pending', + organize_reason: str | None = None, + organize_message: str | None = None, + library_relative_path: str | None = None, + library_file_path: str | None = None, + trash_file_path: str | None = None, + 
organize_decision_json: dict | None = None, + exception_resolution_status: str = 'open', + exception_resolution_json: dict | None = None, + last_repair_task_id: str | None = None + ) -> dict: + timestamp = current_timestamp() + current_file_path = current_file_path or original_path + + with self._connect() as connection: + cursor = connection.execute( + ''' + INSERT INTO task_items ( + task_id, parent_item_id, is_active, original_path, current_file_path, relative_path, + filename, extension, size_bytes, modified_at, local_cover, local_lyric, scan_status, + scan_reason, scan_message, preprocess_status, preprocess_reason, preprocess_message, + audio_props_json, original_tags_json, preprocess_artifacts_json, acoustic_fingerprint, + fingerprint_duration_seconds, match_status, match_reason, match_message, match_source, + match_confidence, match_is_authoritative, matched_metadata_json, match_candidates_json, + match_enrichment_json, dedupe_status, dedupe_reason, dedupe_message, + dedupe_group_key, duplicate_of_path, duplicate_of_item_id, dedupe_decision_json, + organize_status, organize_reason, organize_message, library_relative_path, + library_file_path, trash_file_path, organize_decision_json, + exception_resolution_status, exception_resolution_json, last_repair_task_id, + created_at, updated_at + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
def update_task_item(self, item_id: int, **fields) -> dict:
    """Overwrite columns of one ``task_items`` row and return the parsed row.

    Each keyword argument names a column to set. Values for the ``*_json``
    columns listed in ``json_fields`` are serialised with ``json.dumps``
    unless they are ``None``; every other value is bound as given.
    ``updated_at`` is always refreshed.

    Args:
        item_id: Primary key of the row to update.
        **fields: Column name to new value.

    Returns:
        The row after the update, passed through ``self._parse_item``.

    Raises:
        ValueError: If a keyword is not a plain ASCII identifier. Column
            names cannot be bound as SQL parameters, so they are validated
            before being interpolated into the statement.
        TaskNotFoundError: If no row with ``item_id`` exists.
    """
    json_fields = {
        'audio_props_json',
        'original_tags_json',
        'preprocess_artifacts_json',
        'matched_metadata_json',
        'match_candidates_json',
        'match_enrichment_json',
        'dedupe_decision_json',
        'organize_decision_json',
        'exception_resolution_json'
    }
    assignments: list[str] = []
    values: list[object] = []

    for key, value in fields.items():
        # ``**{...}`` lets callers pass arbitrary strings as keys; refuse
        # anything that could alter the structure of the UPDATE statement.
        if not (key.isascii() and key.isidentifier()):
            raise ValueError(f'Invalid task item column name: {key!r}')
        assignments.append(f'{key} = ?')
        if key in json_fields and value is not None:
            values.append(json.dumps(value))
        else:
            values.append(value)

    assignments.append('updated_at = ?')
    values.append(current_timestamp())
    values.append(item_id)

    with self._connect() as connection:
        connection.execute(
            f'UPDATE task_items SET {", ".join(assignments)} WHERE id = ?',
            values
        )
        connection.commit()
        row = connection.execute(
            'SELECT * FROM task_items WHERE id = ?',
            (item_id,)
        ).fetchone()

    if row is None:
        raise TaskNotFoundError(item_id)

    return self._parse_item(row)
+ AND is_active = 1 + AND match_status IN ('matched', 'matched_fallback') + AND dedupe_status IN ('pending', 'running') + ORDER BY created_at ASC, id ASC + ''', + (task_id,) + ).fetchall() + + return [self._parse_item(row) for row in rows] + + def list_organize_candidate_items(self, task_id: str) -> list[dict]: + with self._connect() as connection: + rows = connection.execute( + ''' + SELECT * + FROM task_items + WHERE task_id = ? + AND is_active = 1 + AND match_status IN ('matched', 'matched_fallback') + AND dedupe_status IN ('unique', 'duplicate_replaced') + AND organize_status IN ('pending', 'running') + ORDER BY created_at ASC, id ASC + ''', + (task_id,) + ).fetchall() + + return [self._parse_item(row) for row in rows] + + def list_task_items( + self, + task_id: str, + scan_status: str | None, + page: int, + page_size: int, + *, + preprocess_status: str | None = None, + match_status: str | None = None, + dedupe_status: str | None = None, + organize_status: str | None = None, + active_only: bool = False + ) -> dict: + offset = (page - 1) * page_size + where_clauses = ['task_id = ?'] + params: list[object] = [task_id] + + if scan_status: + where_clauses.append('scan_status = ?') + params.append(scan_status) + if preprocess_status: + where_clauses.append('preprocess_status = ?') + params.append(preprocess_status) + if match_status: + where_clauses.append('match_status = ?') + params.append(match_status) + if dedupe_status: + where_clauses.append('dedupe_status = ?') + params.append(dedupe_status) + if organize_status: + where_clauses.append('organize_status = ?') + params.append(organize_status) + if active_only: + where_clauses.append('is_active = 1') + + where_clause = f"WHERE {' AND '.join(where_clauses)}" + + with self._connect() as connection: + total = connection.execute( + f'SELECT COUNT(*) AS total FROM task_items {where_clause}', + params + ).fetchone()['total'] + rows = connection.execute( + f''' + SELECT * + FROM task_items + {where_clause} + ORDER BY 
created_at ASC, id ASC + LIMIT ? OFFSET ? + ''', + [*params, page_size, offset] + ).fetchall() + + return { + 'items': [self._parse_item(row) for row in rows], + 'page': page, + 'page_size': page_size, + 'total': total + } + + def list_all_task_items(self, task_id: str, *, active_only: bool = False) -> list[dict]: + where_clauses = ['task_id = ?'] + params: list[object] = [task_id] + if active_only: + where_clauses.append('is_active = 1') + + with self._connect() as connection: + rows = connection.execute( + f''' + SELECT * + FROM task_items + WHERE {' AND '.join(where_clauses)} + ORDER BY created_at ASC, id ASC + ''', + params + ).fetchall() + + return [self._parse_item(row) for row in rows] + + def list_task_history(self, page: int, page_size: int) -> dict: + offset = (page - 1) * page_size + + with self._connect() as connection: + total = connection.execute( + ''' + SELECT COUNT(*) AS total + FROM task_runs + WHERE status IN (?, ?) + AND task_type = 'ingest' + ''', + (TASK_STATUS_COMPLETED, TASK_STATUS_FAILED) + ).fetchone()['total'] + rows = connection.execute( + ''' + WITH task_item_stats AS ( + SELECT + task_id, + COUNT(*) AS total_items, + SUM(CASE WHEN organize_status = 'organized' THEN 1 ELSE 0 END) AS success_items + FROM task_items + GROUP BY task_id + ) + SELECT + task_runs.id AS task_id, + task_runs.started_at, + task_runs.status, + COALESCE(task_item_stats.total_items, 0) AS total_items, + COALESCE(task_item_stats.success_items, 0) AS success_items + FROM task_runs + LEFT JOIN task_item_stats ON task_item_stats.task_id = task_runs.id + WHERE task_runs.status IN (?, ?) + AND task_runs.task_type = 'ingest' + ORDER BY task_runs.started_at DESC, task_runs.id DESC + LIMIT ? OFFSET ? 
def list_exception_source_items(self, resolution_status: str = 'open') -> list[dict]:
    """List task items that qualify as exception sources.

    A row qualifies when any of the status conditions in the query's
    ``WHERE`` group holds (trashed/failed organize, duplicate-trashed or
    failed dedupe, low-score/failed/not-found match, failed conversion,
    a preprocess warning mentioning ``metadata_failed``, or an open/planned
    resolution whose JSON mentions a ``candidate_selected`` or
    ``ready_to_ingest`` workflow state).

    Args:
        resolution_status: ``'open'`` keeps rows whose resolution status is
            ``open`` or ``planned``; ``'all'`` applies no resolution filter;
            ``'resolved'``, ``'ignored'`` and ``'planned'`` match that exact
            status.

    Returns:
        Rows parsed with ``self._parse_item_with_task_context``, newest
        ``updated_at`` first.

    Raises:
        ValueError: If ``resolution_status`` is not one of the supported
            values.
    """
    supported = {'open', 'resolved', 'ignored', 'all', 'planned'}
    if resolution_status not in supported:
        raise ValueError(f'Unsupported resolution status: {resolution_status}')

    bound: list[object] = []
    if resolution_status == 'open':
        status_filter = "AND task_items.exception_resolution_status IN ('open', 'planned')"
    elif resolution_status == 'all':
        status_filter = ''
    else:
        status_filter = 'AND task_items.exception_resolution_status = ?'
        bound.append(resolution_status)

    query = f'''
        SELECT
            task_items.*,
            task_runs.started_at AS task_started_at
        FROM task_items
        JOIN task_runs ON task_runs.id = task_items.task_id
        WHERE (
            task_items.organize_status IN ('trashed', 'failed')
            OR task_items.dedupe_status IN ('duplicate_trashed', 'failed')
            OR task_items.match_status IN ('low_score', 'failed', 'not_found')
            OR (
                task_items.preprocess_status = 'failed'
                AND task_items.preprocess_reason = 'convert_failed'
            )
            OR (
                task_items.preprocess_status = 'warning'
                AND task_items.preprocess_reason LIKE '%metadata_failed%'
            )
            OR (
                task_items.exception_resolution_status IN ('open', 'planned')
                AND task_items.exception_resolution_json IS NOT NULL
                AND (
                    task_items.exception_resolution_json LIKE '%"workflow_state"%candidate_selected%'
                    OR task_items.exception_resolution_json LIKE '%"workflow_state"%ready_to_ingest%'
                )
            )
        )
        {status_filter}
        ORDER BY task_items.updated_at DESC, task_items.id DESC
    '''

    with self._connect() as connection:
        found = connection.execute(query, bound).fetchall()

    return [self._parse_item_with_task_context(entry) for entry in found]
for _ in item_ids) + with self._connect() as connection: + rows = connection.execute( + f''' + SELECT + task_items.*, + task_runs.started_at AS task_started_at + FROM task_items + JOIN task_runs ON task_runs.id = task_items.task_id + WHERE task_items.id IN ({placeholders}) + ''', + item_ids + ).fetchall() + + return [self._parse_item_with_task_context(row) for row in rows] + + def get_task_snapshot(self, task_id: str) -> dict: + task = self.get_task(task_id) + + with self._connect() as connection: + total_logs = connection.execute( + 'SELECT COUNT(*) AS total FROM task_logs WHERE task_id = ?', + (task_id,) + ).fetchone()['total'] + recent_rows = connection.execute( + ''' + SELECT * + FROM task_logs + WHERE task_id = ? + ORDER BY id DESC + LIMIT ? + ''', + (task_id, SCAN_PROGRESS_LOG_LIMIT) + ).fetchall() + + recent_logs = [self._parse_log(row) for row in reversed(recent_rows)] + + return { + 'task': task, + 'recent_logs': recent_logs, + 'recent_logs_limit': SCAN_PROGRESS_LOG_LIMIT, + 'has_more_logs': total_logs > len(recent_logs), + 'latest_log_id': recent_logs[-1]['id'] if recent_logs else None + } + + def _parse_task(self, row: sqlite3.Row | None) -> dict | None: + if row is None: + return None + + raw_stats = json.loads(row['stats_json']) + if row['task_type'] != TASK_TYPE_INGEST: + stats = { + 'prepare': { + **create_empty_repair_stats()['prepare'], + **(raw_stats.get('prepare') or {}) + }, + 'execute': { + **create_empty_repair_stats()['execute'], + **(raw_stats.get('execute') or {}) + } + } + elif 'scan' in raw_stats: + stats = { + 'scan': { + **create_empty_scan_stats(), + **(raw_stats.get('scan') or {}) + }, + 'preprocess': { + **create_empty_preprocess_stats(), + **(raw_stats.get('preprocess') or {}) + }, + 'match': { + **create_empty_match_stats(), + **(raw_stats.get('match') or {}) + }, + 'dedupe': { + **create_empty_dedupe_stats(), + **(raw_stats.get('dedupe') or {}) + }, + 'organize': { + **create_empty_organize_stats(), + **(raw_stats.get('organize') 
or {}) + } + } + else: + stats = { + 'scan': { + **create_empty_scan_stats(), + **raw_stats + }, + 'preprocess': create_empty_preprocess_stats(), + 'match': create_empty_match_stats(), + 'dedupe': create_empty_dedupe_stats(), + 'organize': create_empty_organize_stats() + } + + return { + 'task_id': row['id'], + 'task_type': row['task_type'] or TASK_TYPE_INGEST, + 'trigger_source': row['trigger_source'], + 'source_task_id': row['source_task_id'], + 'status': row['status'], + 'current_stage': row['current_stage'], + 'stage_states': json.loads(row['stage_states_json']), + 'stats': stats, + 'repair_plan_json': json.loads(row['repair_plan_json']) if row['repair_plan_json'] else None, + 'error_message': row['error_message'], + 'started_at': row['started_at'], + 'completed_at': row['completed_at'], + 'updated_at': row['updated_at'] + } + + def _parse_item(self, row: sqlite3.Row) -> dict: + return { + 'id': row['id'], + 'task_id': row['task_id'], + 'parent_item_id': row['parent_item_id'], + 'is_active': bool(row['is_active']), + 'original_path': row['original_path'], + 'current_file_path': row['current_file_path'] or row['original_path'], + 'relative_path': row['relative_path'], + 'filename': row['filename'], + 'extension': row['extension'], + 'size_bytes': row['size_bytes'], + 'modified_at': row['modified_at'], + 'local_cover': row['local_cover'], + 'local_lyric': row['local_lyric'], + 'scan_status': row['scan_status'], + 'scan_reason': row['scan_reason'], + 'scan_message': row['scan_message'], + 'preprocess_status': row['preprocess_status'] or 'pending', + 'preprocess_reason': row['preprocess_reason'], + 'preprocess_message': row['preprocess_message'], + 'audio_props_json': json.loads(row['audio_props_json']) if row['audio_props_json'] else None, + 'original_tags_json': json.loads(row['original_tags_json']) if row['original_tags_json'] else None, + 'preprocess_artifacts_json': ( + json.loads(row['preprocess_artifacts_json']) + if row['preprocess_artifacts_json'] + else 
None + ), + 'acoustic_fingerprint': row['acoustic_fingerprint'], + 'fingerprint_duration_seconds': row['fingerprint_duration_seconds'], + 'match_status': row['match_status'] or 'pending', + 'match_reason': row['match_reason'], + 'match_message': row['match_message'], + 'match_source': row['match_source'], + 'match_confidence': row['match_confidence'], + 'match_is_authoritative': bool(row['match_is_authoritative']), + 'matched_metadata_json': ( + json.loads(row['matched_metadata_json']) + if row['matched_metadata_json'] + else None + ), + 'match_candidates_json': ( + json.loads(row['match_candidates_json']) + if row['match_candidates_json'] + else None + ), + 'match_enrichment_json': ( + json.loads(row['match_enrichment_json']) + if row['match_enrichment_json'] + else None + ), + 'dedupe_status': row['dedupe_status'] or 'pending', + 'dedupe_reason': row['dedupe_reason'], + 'dedupe_message': row['dedupe_message'], + 'dedupe_group_key': row['dedupe_group_key'], + 'duplicate_of_path': row['duplicate_of_path'], + 'duplicate_of_item_id': row['duplicate_of_item_id'], + 'dedupe_decision_json': ( + json.loads(row['dedupe_decision_json']) + if row['dedupe_decision_json'] + else None + ), + 'organize_status': row['organize_status'] or 'pending', + 'organize_reason': row['organize_reason'], + 'organize_message': row['organize_message'], + 'library_relative_path': row['library_relative_path'], + 'library_file_path': row['library_file_path'], + 'trash_file_path': row['trash_file_path'], + 'organize_decision_json': ( + json.loads(row['organize_decision_json']) + if row['organize_decision_json'] + else None + ), + 'exception_resolution_status': row['exception_resolution_status'] or 'open', + 'exception_resolution_json': ( + json.loads(row['exception_resolution_json']) + if row['exception_resolution_json'] + else None + ), + 'last_repair_task_id': row['last_repair_task_id'], + 'created_at': row['created_at'], + 'updated_at': row['updated_at'] + } + + def 
class TaskStreamManager:
    """Track WebSocket subscribers per task and push events to them.

    Sockets are grouped by task id. ``broadcast_event`` hands the send work
    to the event loop registered through ``set_loop`` using
    ``asyncio.run_coroutine_threadsafe``.
    """

    def __init__(self):
        # Loop that runs the sends; stays None until set_loop() is called.
        self._loop: asyncio.AbstractEventLoop | None = None
        # task id -> sockets currently subscribed to that task.
        self._connections: dict[str, set[WebSocket]] = defaultdict(set)

    def set_loop(self, loop: asyncio.AbstractEventLoop):
        """Remember the event loop on which broadcasts are scheduled."""
        self._loop = loop

    async def connect(self, task_id: str, websocket: WebSocket):
        """Accept ``websocket`` and subscribe it to ``task_id``."""
        await websocket.accept()
        self._connections[task_id].add(websocket)

    def disconnect(self, task_id: str, websocket: WebSocket):
        """Unsubscribe ``websocket``; drop the task entry once it is empty."""
        subscribers = self._connections.get(task_id)
        if subscribers:
            subscribers.discard(websocket)
            if not subscribers:
                self._connections.pop(task_id, None)

    def broadcast_event(self, task_id: str, event_type: str, stage: str, data: dict):
        """Schedule delivery of an event to every subscriber of ``task_id``.

        Does nothing when no loop has been registered. The call does not
        wait for the sends to finish.
        """
        if self._loop is None:
            return

        envelope = {
            'type': event_type,
            'task_id': task_id,
            'stage': stage,
            'timestamp': current_timestamp(),
            'data': data
        }
        pending_send = self._broadcast(task_id, envelope)
        asyncio.run_coroutine_threadsafe(pending_send, self._loop)

    async def _broadcast(self, task_id: str, payload: dict):
        """Send ``payload`` to each subscriber and drop sockets that fail."""
        stale: list[WebSocket] = []

        # Iterate over a snapshot so disconnects cannot disturb the loop.
        for websocket in tuple(self._connections.get(task_id, ())):
            try:
                await websocket.send_json(payload)
            except Exception:
                # Best effort: a failed send marks the socket for removal.
                stale.append(websocket)

        for websocket in stale:
            self.disconnect(task_id, websocket)
+ def test_get_config_returns_defaults(self): + data = main_module.get_config() + + self.assertIn('advancedStrategy', data) + self.assertNotIn('metadataStatus', data) + self.assertEqual(data['schedule']['cron'], '0 2 * * *') + self.assertEqual(data['metadata']['acoustidUrl'], 'https://api.acoustid.org/v2') + + def test_put_config_persists_changes(self): + payload = self.store.get_config() + payload['input'] = '/tmp/input' + payload['advancedStrategy']['replaceLowQualityDuplicates'] = True + + expected_statuses = build_metadata_statuses() + with patch.object(main_module, 'probe_metadata_services', return_value=expected_statuses): + saved_payload = main_module.update_config(ConfigPayload.model_validate(payload)) + + read_payload = self.store.get_config() + + self.assertEqual(saved_payload['config']['input'], '/tmp/input') + self.assertEqual(read_payload['input'], '/tmp/input') + self.assertTrue(read_payload['advancedStrategy']['replaceLowQualityDuplicates']) + self.assertEqual(saved_payload['metadataStatus'], expected_statuses) + + def test_get_metadata_status_returns_probe_results(self): + expected_statuses = build_metadata_statuses() + + with patch.object(main_module, 'probe_metadata_services', return_value=expected_statuses) as probe: + response = main_module.get_config_metadata_status() + + self.assertEqual(response, {'metadataStatus': expected_statuses}) + probe.assert_called_once_with(self.store.get_config()['metadata']) + + def test_probe_metadata_services_skips_missing_credentials(self): + payload = self.store.get_config() + + with patch.object( + metadata_status_module, + 'probe_url', + return_value={'status': 'online', 'latencyMs': 100, 'message': '可达 (HTTP 200)'} + ) as probe_url: + statuses = metadata_status_module.probe_metadata_services(payload['metadata']) + + self.assertEqual(statuses['acoustid']['status'], 'none') + self.assertEqual(statuses['musicbrainz']['status'], 'online') + self.assertEqual(statuses['netease']['status'], 'online') + 
def build_metadata_statuses():
    """Return a fixed per-service metadata status mapping used by the tests.

    Each value has ``status``, ``latencyMs`` and ``message`` keys. A fresh
    dict is built for every entry on every call.
    """
    def skipped():
        # Entry shape for a service whose status is 'none'.
        return {'status': 'none', 'latencyMs': None, 'message': '缺失凭据,跳过测试'}

    def probed(status, latency_ms, message):
        return {'status': status, 'latencyMs': latency_ms, 'message': message}

    statuses = {}
    statuses['acoustid'] = skipped()
    statuses['musicbrainz'] = probed('online', 123, '可达 (HTTP 200)')
    statuses['netease'] = probed('online', 42, '可达 (HTTP 200)')
    statuses['qq'] = probed('warning', 680, '高延迟 (HTTP 200)')
    for service in ('spotify', 'discogs', 'lastfm', 'genius'):
        statuses[service] = skipped()
    return statuses
@unittest.skipIf(main_module is None, f'api deps unavailable: {FASTAPI_IMPORT_ERROR}')
class ExceptionApiTests(unittest.TestCase):
    """Exercise the exception endpoints of ``main_module`` against a fake service.

    ``setUp`` swaps ``main_module.exception_service`` for a
    ``_FakeExceptionService`` and ``tearDown`` restores the previous one.
    """

    def setUp(self):
        self.previous_service = main_module.exception_service
        self.fake_service = _FakeExceptionService()
        main_module.exception_service = self.fake_service
        # TestClient is None when fastapi.testclient could not be imported.
        self.client = TestClient(main_module.app) if TestClient else None

    def tearDown(self):
        main_module.exception_service = self.previous_service

    def test_get_exception_summary_serializes_payload(self):
        response = main_module.get_exception_summary()
        payload = ExceptionSummaryPayload.model_validate(response)

        self.assertEqual(payload.total, 6)
        self.assertEqual(payload.counts_by_type['duplicates'], 2)
        self.assertEqual(self.fake_service.summary_calls, 1)

    def test_get_exception_items_passes_filters_and_pagination(self):
        response = main_module.get_exception_items(
            type='duplicates',
            resolution_status='resolved',
            page=2,
            page_size=25
        )
        payload = ExceptionListResponse.model_validate(response)

        self.assertEqual(payload.page, 2)
        self.assertEqual(payload.page_size, 25)
        self.assertEqual(payload.total, 1)
        self.assertEqual(payload.items[0].exception_id, 101)
        self.assertEqual(payload.items[0].exception_type, 'duplicates')
        self.assertEqual(
            self.fake_service.list_calls,
            [{'type': 'duplicates', 'resolution_status': 'resolved', 'page': 2, 'page_size': 25}]
        )

    def test_get_exception_item_serializes_detail_payload(self):
        response = main_module.get_exception_item(101)
        payload = ExceptionDetailPayload.model_validate(response)

        self.assertEqual(payload.exception_id, 101)
        self.assertEqual(payload.filename, 'duplicate.flac')
        self.assertEqual(payload.dedupe_decision_json['comparison_scope'], 'library')
        self.assertEqual(self.fake_service.detail_calls, [101])

    def test_get_exception_item_not_found_raises_service_error(self):
        with self.assertRaises(ExceptionItemNotFoundError):
            main_module.get_exception_item(999)

        response = main_module.exception_item_not_found_error_handler(
            None,
            ExceptionItemNotFoundError(999)
        )
        self.assertEqual(response.status_code, 404)

    def test_streams_exception_audio_with_range_support(self):
        if self.client is None:
            self.skipTest('fastapi test client unavailable')

        # A private temporary directory gives a unique path without relying
        # on the undocumented tempfile._get_candidate_names(), and the file
        # is removed with the directory even when an assertion fails.
        with tempfile.TemporaryDirectory() as workdir:
            audio_path = Path(workdir) / 'exception-audio.mp3'
            audio_path.write_bytes(b'0123456789abcdef')
            self.fake_service.audio_path = audio_path

            full_response = self.client.get('/api/exceptions/items/101/audio')
            self.assertEqual(full_response.status_code, 200)
            self.assertEqual(full_response.content, b'0123456789abcdef')
            self.assertEqual(full_response.headers['accept-ranges'], 'bytes')

            range_response = self.client.get(
                '/api/exceptions/items/101/audio',
                headers={'Range': 'bytes=4-7'}
            )
            self.assertEqual(range_response.status_code, 206)
            self.assertEqual(range_response.content, b'4567')
            self.assertEqual(range_response.headers['content-range'], 'bytes 4-7/16')
'page_size': page_size + } + ) + return { + 'items': [self._detail_payload()], + 'page': page, + 'page_size': page_size, + 'total': 1 + } + + def get_item(self, exception_id: int) -> dict: + self.detail_calls.append(exception_id) + if exception_id != 101: + raise ExceptionItemNotFoundError(exception_id) + return self._detail_payload() + + def resolve_audio_path(self, exception_id: int) -> Path: + self.detail_calls.append(exception_id) + if exception_id != 101 or self.audio_path is None: + raise FileNotFoundError(f'No playable audio found for exception item: {exception_id}') + return self.audio_path + + def _detail_payload(self) -> dict: + return { + 'exception_id': 101, + 'task_id': 'task-123', + 'task_started_at': '2024-01-01T08:00:00Z', + 'exception_type': 'duplicates', + 'exception_stage': 'dedupe', + 'exception_reason_code': 'library_duplicate', + 'exception_message': '输出库中已存在重复文件,保留库内文件', + 'captured_at': '2024-01-03T12:00:00Z', + 'filename': 'duplicate.flac', + 'relative_path': 'Artist/Album/duplicate.flac', + 'original_path': '/tmp/input/duplicate.flac', + 'current_file_path': '/tmp/trash/duplicate.flac', + 'trash_file_path': '/tmp/trash/duplicate.flac', + 'audio_props_json': {'codec': 'FLAC'}, + 'original_tags_json': {'title': 'Song'}, + 'matched_metadata_json': {'title': 'Song'}, + 'duplicate_of_path': '/tmp/output/Artist/Old.flac', + 'dedupe_decision_json': {'comparison_scope': 'library'}, + 'library_relative_path': None, + 'library_file_path': None, + 'match_source': 'musicbrainz', + 'match_confidence': 91.2, + 'preview_available': False, + 'available_actions': [], + 'exception_resolution_status': 'open', + 'exception_resolution_json': None, + 'workflow_state': 'open', + 'raw_metadata': {'title': 'Song'}, + 'metadata_draft': {'title': 'Song'}, + 'effective_metadata': {'title': 'Song', 'artist': 'Artist', 'album_artist': 'Artist'}, + 'can_ingest': True, + 'pending_ingest': False, + 'display_title': 'Song', + 'display_reason': '输出库中已存在重复文件,保留库内文件', + 
'type_label': '文件重复', + 'preprocess_artifacts_json': None, + 'match_candidates_json': None, + 'match_enrichment_json': None, + 'organize_decision_json': None + } + + +if __name__ == '__main__': + unittest.main() diff --git a/backend/tests/test_exception_service.py b/backend/tests/test_exception_service.py new file mode 100644 index 0000000..872a23c --- /dev/null +++ b/backend/tests/test_exception_service.py @@ -0,0 +1,440 @@ +import os +import tempfile +import unittest +from pathlib import Path + +os.environ['MUSIC_WORKSHOP_DB_PATH'] = str( + Path(tempfile.gettempdir()) / f'music_workshop_exception_service_{next(tempfile._get_candidate_names())}.db' +) + +from backend.app.exception_service import ExceptionItemNotFoundError, ExceptionService +from backend.app.task_store import TaskStore + + +class ExceptionServiceTests(unittest.TestCase): + def setUp(self): + self.db_path = Path(os.environ['MUSIC_WORKSHOP_DB_PATH']) + if self.db_path.exists(): + self.db_path.unlink() + self.task_store = TaskStore(self.db_path) + self.service = ExceptionService(self.task_store) + self.task = self.task_store.create_task_if_idle( + { + 'input': '/tmp/input', + 'output': '/tmp/output', + 'trash': '/tmp/trash' + } + ) + + def test_empty_summary_list_and_detail_not_found(self): + summary = self.service.get_summary() + self.assertEqual(summary['total'], 0) + self.assertEqual( + summary['counts_by_type'], + { + 'missing_tags': 0, + 'duplicates': 0, + 'match_failed': 0, + 'low_score': 0, + 'convert_failed': 0, + 'organize_failed': 0 + } + ) + + page = self.service.get_items() + self.assertEqual(page['items'], []) + self.assertEqual(page['total'], 0) + + with self.assertRaises(ExceptionItemNotFoundError): + self.service.get_item(9999) + + def test_maps_exception_types_and_applies_priority(self): + missing_tags = self._insert_item( + filename='missing-tags.flac', + preprocess_status='warning', + preprocess_reason='cover_missing,metadata_failed', + preprocess_message='无法提取有效元数据' + ) + low_score 
= self._insert_item( + filename='low-score.flac', + match_status='low_score', + match_reason='score_gap_too_small', + match_message='匹配候选分数过低', + matched_metadata_json={'title': 'Likely Match'} + ) + match_failed = self._insert_item( + filename='match-failed.flac', + match_status='not_found', + match_reason='no_candidate', + match_message='MusicBrainz 查无此曲' + ) + convert_failed = self._insert_item( + filename='convert-failed.flac', + preprocess_status='failed', + preprocess_reason='convert_failed', + preprocess_message='音频转码失败' + ) + duplicate = self._insert_item( + filename='duplicate.flac', + dedupe_status='duplicate_trashed', + dedupe_reason='library_duplicate', + dedupe_message='输出库中已存在重复文件,保留库内文件', + duplicate_of_path='/tmp/output/Artist/Old.flac', + dedupe_decision_json={ + 'comparison_scope': 'library', + 'identity_basis': 'recording_id', + 'compared_candidates': [ + {'side': 'kept', 'path': '/tmp/output/Artist/Old.flac'}, + {'side': 'trashed', 'path': '/tmp/input/duplicate.flac'} + ] + }, + trash_file_path='/tmp/trash/duplicates/task-1/duplicate.flac' + ) + organize_failed = self._insert_item( + filename='organize-failed.flac', + organize_status='failed', + organize_reason='target_conflict', + organize_message='整理入库失败' + ) + priority_item = self._insert_item( + filename='priority.flac', + preprocess_status='failed', + preprocess_reason='convert_failed', + preprocess_message='音频转码失败', + match_status='low_score', + match_reason='score_gap_too_small', + match_message='匹配候选分数过低', + dedupe_status='failed', + dedupe_reason='trash_move_failed', + dedupe_message='重复检测失败', + organize_status='trashed', + organize_reason='manual_review', + organize_message='已移入回收站等待人工处理' + ) + + summary = self.service.get_summary() + self.assertEqual(summary['total'], 7) + self.assertEqual(summary['counts_by_type']['missing_tags'], 1) + self.assertEqual(summary['counts_by_type']['duplicates'], 1) + self.assertEqual(summary['counts_by_type']['match_failed'], 1) + 
self.assertEqual(summary['counts_by_type']['low_score'], 1) + self.assertEqual(summary['counts_by_type']['convert_failed'], 1) + self.assertEqual(summary['counts_by_type']['organize_failed'], 2) + + items = self.service.get_items(page_size=20)['items'] + indexed = {item['filename']: item for item in items} + + self.assertEqual(indexed['missing-tags.flac']['exception_type'], 'missing_tags') + self.assertEqual( + indexed['missing-tags.flac']['available_actions'], + ['retry_match', 'edit_metadata', 'save_and_organize', 'ignore_exception', 'delete_file'] + ) + self.assertEqual(indexed['low-score.flac']['display_title'], 'Likely Match') + self.assertEqual(indexed['low-score.flac']['exception_type'], 'low_score') + self.assertFalse(indexed['low-score.flac']['can_ingest']) + self.assertEqual(indexed['low-score.flac']['workflow_state'], 'open') + self.assertEqual(indexed['match-failed.flac']['exception_type'], 'match_failed') + self.assertEqual(indexed['convert-failed.flac']['exception_type'], 'convert_failed') + self.assertEqual(indexed['duplicate.flac']['exception_type'], 'duplicates') + self.assertEqual(indexed['organize-failed.flac']['exception_type'], 'organize_failed') + self.assertEqual(indexed['priority.flac']['exception_type'], 'organize_failed') + self.assertEqual(indexed['priority.flac']['exception_stage'], 'organize') + self.assertEqual(indexed['priority.flac']['exception_reason_code'], 'manual_review') + self.assertEqual( + indexed['priority.flac']['available_actions'], + ['edit_target_path', 'move_to_review_trash', 'ignore_exception', 'delete_file'] + ) + + self.assertEqual(missing_tags['id'], indexed['missing-tags.flac']['exception_id']) + self.assertEqual(low_score['id'], indexed['low-score.flac']['exception_id']) + self.assertEqual(match_failed['id'], indexed['match-failed.flac']['exception_id']) + self.assertEqual(convert_failed['id'], indexed['convert-failed.flac']['exception_id']) + self.assertEqual(duplicate['id'], 
indexed['duplicate.flac']['exception_id']) + self.assertEqual(organize_failed['id'], indexed['organize-failed.flac']['exception_id']) + self.assertEqual(priority_item['id'], indexed['priority.flac']['exception_id']) + + def test_duplicate_detail_preserves_comparison_data(self): + duplicate = self._insert_item( + filename='duplicate.flac', + dedupe_status='duplicate_trashed', + dedupe_reason='library_duplicate', + dedupe_message='输出库中已存在重复文件,保留库内文件', + duplicate_of_path='/tmp/output/Artist/Old.flac', + dedupe_decision_json={ + 'comparison_scope': 'library', + 'identity_basis': 'recording_id', + 'quality_breakdown': { + 'kept': {'total': 88.0}, + 'trashed': {'total': 72.0} + }, + 'compared_candidates': [ + {'side': 'kept', 'path': '/tmp/output/Artist/Old.flac', 'quality_score': 88.0}, + {'side': 'trashed', 'path': '/tmp/input/duplicate.flac', 'quality_score': 72.0} + ] + }, + trash_file_path='/tmp/trash/duplicates/task-1/duplicate.flac' + ) + + detail = self.service.get_item(duplicate['id']) + self.assertEqual(detail['exception_type'], 'duplicates') + self.assertEqual(detail['duplicate_of_path'], '/tmp/output/Artist/Old.flac') + self.assertEqual(detail['trash_file_path'], '/tmp/trash/duplicates/task-1/duplicate.flac') + self.assertEqual(detail['dedupe_decision_json']['comparison_scope'], 'library') + self.assertEqual(len(detail['dedupe_decision_json']['compared_candidates']), 2) + self.assertFalse(detail['preview_available']) + + def test_resolution_filter_hides_resolved_by_default(self): + resolved_item = self._insert_item( + filename='resolved.flac', + match_status='not_found', + match_reason='no_candidate', + match_message='未找到匹配', + exception_resolution_status='resolved', + exception_resolution_json={ + 'before_snapshot': { + 'exception_type': 'match_failed', + 'exception_stage': 'match', + 'exception_reason_code': 'no_candidate', + 'exception_message': '未找到匹配' + } + } + ) + open_item = self._insert_item( + filename='open.flac', + match_status='failed', + 
match_reason='provider_error', + match_message='匹配失败' + ) + + open_page = self.service.get_items() + resolved_page = self.service.get_items(resolution_status='resolved') + + self.assertEqual([item['exception_id'] for item in open_page['items']], [open_item['id']]) + self.assertEqual([item['exception_id'] for item in resolved_page['items']], [resolved_item['id']]) + + def test_candidate_selected_item_remains_open_and_pending_ingest(self): + item = self._insert_item( + filename='candidate-selected.flac', + match_status='matched_fallback', + match_reason='manual_candidate_selected', + match_message='已手动确认匹配候选', + matched_metadata_json={'title': 'Song', 'artist': 'Artist', 'album_artist': 'Artist'}, + exception_resolution_json={ + 'workflow_state': 'candidate_selected', + 'metadata_draft': {'title': 'Song', 'artist': 'Artist', 'album_artist': 'Artist'}, + 'before_snapshot': { + 'exception_type': 'low_score', + 'exception_stage': 'match', + 'exception_reason_code': 'score_gap_too_small', + 'exception_message': '匹配候选分数过低' + } + } + ) + + open_page = self.service.get_items() + indexed = {row['filename']: row for row in open_page['items']} + detail = indexed['candidate-selected.flac'] + + self.assertEqual(item['id'], detail['exception_id']) + self.assertEqual(detail['workflow_state'], 'ready_to_ingest') + self.assertTrue(detail['pending_ingest']) + self.assertTrue(detail['can_ingest']) + self.assertEqual(detail['exception_type'], 'low_score') + self.assertIn('save_and_organize', detail['available_actions']) + + def test_effective_metadata_derives_album_artist_for_ingest(self): + item = self._insert_item( + filename='derived-album-artist.flac', + match_status='low_score', + match_reason='score_gap_too_small', + match_message='匹配候选分数过低', + matched_metadata_json={ + 'title': 'Song', + 'artist': 'Artist A feat. 
Guest', + 'album': 'Album X' + } + ) + self._insert_item( + filename='derived-album-artist-2.flac', + match_status='low_score', + match_reason='score_gap_too_small', + match_message='匹配候选分数过低', + matched_metadata_json={ + 'title': 'Song 2', + 'artist': 'Artist A', + 'album': 'Album X' + } + ) + + detail = self.service.get_item(item['id']) + self.assertEqual(detail['effective_metadata']['album_artist'], 'Artist A') + self.assertEqual(detail['normalization_strategy'], 'main_artist_feat') + self.assertTrue(detail['can_ingest']) + self.assertEqual(detail['workflow_state'], 'ready_to_ingest') + + def test_filters_and_paginates_by_captured_at_desc(self): + older_duplicate = self._insert_item( + filename='older-duplicate.flac', + dedupe_status='duplicate_trashed', + dedupe_reason='library_duplicate', + dedupe_message='重复文件' + ) + newest_match_failed = self._insert_item( + filename='newest-match-failed.flac', + match_status='failed', + match_reason='provider_error', + match_message='匹配服务请求失败' + ) + middle_convert_failed = self._insert_item( + filename='middle-convert-failed.flac', + preprocess_status='failed', + preprocess_reason='convert_failed', + preprocess_message='音频转码失败' + ) + + self._set_updated_at(older_duplicate['id'], '2024-01-01T00:00:00Z') + self._set_updated_at(middle_convert_failed['id'], '2024-01-02T00:00:00Z') + self._set_updated_at(newest_match_failed['id'], '2024-01-03T00:00:00Z') + + first_page = self.service.get_items(page=1, page_size=2) + second_page = self.service.get_items(page=2, page_size=2) + duplicate_page = self.service.get_items('duplicates', page=1, page_size=10) + + self.assertEqual(first_page['total'], 3) + self.assertEqual( + [item['filename'] for item in first_page['items']], + ['newest-match-failed.flac', 'middle-convert-failed.flac'] + ) + self.assertEqual([item['filename'] for item in second_page['items']], ['older-duplicate.flac']) + self.assertEqual(duplicate_page['total'], 1) + 
self.assertEqual(duplicate_page['items'][0]['filename'], 'older-duplicate.flac') + + def test_summary_counts_without_triggering_metadata_normalization(self): + self._insert_item( + filename='low-score.flac', + match_status='low_score', + match_reason='score_gap_too_small', + match_message='匹配候选分数过低', + matched_metadata_json={'title': 'Song'} + ) + self._insert_item( + filename='match-failed.flac', + match_status='failed', + match_reason='provider_error', + match_message='匹配服务请求失败' + ) + + def fail_normalize(*args, **kwargs): + raise AssertionError('get_summary should not normalize metadata') + + self.service.metadata_normalizer.normalize_item = fail_normalize + + summary = self.service.get_summary() + + self.assertEqual(summary['total'], 2) + self.assertEqual(summary['counts_by_type']['low_score'], 1) + self.assertEqual(summary['counts_by_type']['match_failed'], 1) + + def test_get_items_only_normalizes_current_page(self): + first_item = self._insert_item( + filename='page-1.flac', + match_status='low_score', + match_reason='score_gap_too_small', + match_message='匹配候选分数过低', + matched_metadata_json={'title': 'Page 1', 'artist': 'Artist A', 'album': 'Album X'} + ) + second_item = self._insert_item( + filename='page-2.flac', + match_status='low_score', + match_reason='score_gap_too_small', + match_message='匹配候选分数过低', + matched_metadata_json={'title': 'Page 2', 'artist': 'Artist A', 'album': 'Album X'} + ) + third_item = self._insert_item( + filename='page-3.flac', + match_status='low_score', + match_reason='score_gap_too_small', + match_message='匹配候选分数过低', + matched_metadata_json={'title': 'Page 3', 'artist': 'Artist A', 'album': 'Album X'} + ) + + self._set_updated_at(first_item['id'], '2030-01-03T00:00:00Z') + self._set_updated_at(second_item['id'], '2030-01-02T00:00:00Z') + self._set_updated_at(third_item['id'], '2030-01-01T00:00:00Z') + calls = [] + original_normalize_item = self.service.metadata_normalizer.normalize_item + + def tracked_normalize(item, 
metadata_patch=None, cache=None): + calls.append(item['id']) + return original_normalize_item(item, metadata_patch, cache) + + self.service.metadata_normalizer.normalize_item = tracked_normalize + + page = self.service.get_items(page=1, page_size=1) + + self.assertEqual(page['total'], 3) + self.assertEqual([item['filename'] for item in page['items']], ['page-1.flac']) + self.assertEqual(calls, [first_item['id']]) + + def test_get_items_reuses_task_level_normalization_cache_within_page(self): + list_all_calls = [] + original_list_all_task_items = self.task_store.list_all_task_items + + def tracked_list_all_task_items(task_id, active_only=True): + list_all_calls.append((task_id, active_only)) + return original_list_all_task_items(task_id, active_only=active_only) + + self.task_store.list_all_task_items = tracked_list_all_task_items + + self._insert_item( + filename='shared-1.flac', + match_status='low_score', + match_reason='score_gap_too_small', + match_message='匹配候选分数过低', + matched_metadata_json={'title': 'Shared 1', 'artist': 'Artist A feat. 
Guest', 'album': 'Album X'} + ) + self._insert_item( + filename='shared-2.flac', + match_status='low_score', + match_reason='score_gap_too_small', + match_message='匹配候选分数过低', + matched_metadata_json={'title': 'Shared 2', 'artist': 'Artist A', 'album': 'Album X'} + ) + + page = self.service.get_items(page=1, page_size=2) + + self.assertEqual(len(page['items']), 2) + self.assertEqual(len(list_all_calls), 1) + self.assertTrue(all(item['can_ingest'] for item in page['items'])) + + def _insert_item(self, **overrides): + filename = overrides.pop('filename', f'item-{next(tempfile._get_candidate_names())}.flac') + extension = Path(filename).suffix or '.flac' + return self.task_store.insert_task_item( + self.task['task_id'], + original_path=f'/tmp/input/{filename}', + current_file_path=f'/tmp/input/{filename}', + relative_path=f'Artist/Album/{filename}', + filename=filename, + extension=extension, + size_bytes=123456, + modified_at='2024-01-01T00:00:00Z', + local_cover=None, + local_lyric=None, + scan_status='queued', + scan_reason=None, + scan_message=None, + **overrides + ) + + def _set_updated_at(self, item_id: int, timestamp: str): + with self.task_store._connect() as connection: + connection.execute( + 'UPDATE task_items SET updated_at = ? 
@unittest.skipIf(main_module is None, f'api deps unavailable: {FASTAPI_IMPORT_ERROR}')
class LibraryApiTests(unittest.TestCase):
    """Call the library route functions directly against a fake service.

    ``setUp`` replaces ``main_module.store`` and
    ``main_module.library_service`` with a throwaway ``ConfigStore`` and a
    ``_FakeLibraryService``; ``tearDown`` puts the originals back.
    """

    def setUp(self):
        # Start every test from a clean database file.
        db_file = Path(os.environ['MUSIC_WORKSHOP_DB_PATH'])
        if db_file.exists():
            db_file.unlink()
        self.db_path = db_file

        # Persist a config whose output directory the tests can recognise.
        self.store = ConfigStore(db_file)
        stored_config = self.store.get_config()
        stored_config['output'] = '/tmp/library-output'
        self.store.save_config(stored_config)

        # Remember the real collaborators before swapping in the doubles.
        self.previous_store = main_module.store
        self.previous_service = main_module.library_service
        self.fake_service = _FakeLibraryService()
        main_module.store = self.store
        main_module.library_service = self.fake_service

    def tearDown(self):
        main_module.store = self.previous_store
        main_module.library_service = self.previous_service

    def test_get_library_summary_uses_current_output_config(self):
        """The summary route passes the configured output dir to the service."""
        payload = LibrarySummaryPayload.model_validate(main_module.get_library_summary())

        observed = (payload.total_tracks, payload.total_albums, payload.total_artists)
        self.assertEqual(observed[0], 12)
        self.assertEqual(observed[1], 3)
        self.assertEqual(observed[2], 2)
        self.assertEqual(self.fake_service.summary_calls, ['/tmp/library-output'])

    def test_get_library_tracks_passes_pagination_filters_and_serializes_provenance(self):
        """Every filter reaches the service and the page validates."""
        filters = {
            'q': 'echoes',
            'artist': 'Artist A',
            'album': 'Album A',
            'format': 'FLAC',
            'has_provenance': True,
            'page': 2,
            'page_size': 25,
            'sort_by': 'filename',
            'sort_order': 'asc'
        }

        response = main_module.get_library_tracks(**filters)
        payload = LibraryTracksPageResponse.model_validate(response)

        self.assertEqual(payload.page, 2)
        self.assertEqual(payload.page_size, 25)
        self.assertEqual(payload.total, 1)
        first_track = payload.items[0]
        self.assertEqual(first_track.track_id, 'track-1')
        self.assertEqual(first_track.ingest_provenance.task_id, 'task-123')
        # The service must see the configured output dir plus every filter.
        self.assertEqual(
            self.fake_service.track_calls,
            [{'output_dir': '/tmp/library-output', **filters}]
        )

    def test_move_library_track_to_exception_uses_current_config(self):
        """The move route hands the stored output/trash paths to the service."""
        response = main_module.move_library_track_to_exception('track-1')
        payload = LibraryMoveToExceptionResponse.model_validate(response)

        self.assertEqual(payload.exception_id, 123)
        self.assertEqual(payload.library_relative_path, 'A/Artist A/Album A/01 - Echoes.flac')
        # 'output' is the value saved in setUp; 'trash' is never overridden
        # here, so it is whatever the stored config already contained.
        expected_call = {
            'output': '/tmp/library-output',
            'trash': '/volume1/docker/navidrome/trash',
            'track_id': 'track-1'
        }
        self.assertEqual(self.fake_service.move_calls, [expected_call])

    def test_move_library_track_to_exception_maps_conflict_to_409(self):
        """A TaskConflictError raised by the service becomes an HTTP 409."""
        self.fake_service.move_error = TaskConflictError('active-task')

        response = main_module.move_library_track_to_exception('track-1')

        self.assertEqual(response.status_code, 409)

    def test_library_track_not_found_handler_returns_404(self):
        """The not-found exception handler answers with an HTTP 404."""
        missing = LibraryTrackNotFoundError('missing-track')

        response = main_module.library_track_not_found_error_handler(None, missing)

        self.assertEqual(response.status_code, 404)
A/Album A/01 - Echoes.flac', + 'trash_file_path': '/tmp/trash/match_failed/task-1/01 - Echoes.flac', + 'message': '已移入异常中心,等待重新匹配' + } + + +if __name__ == '__main__': + unittest.main() diff --git a/backend/tests/test_library_postprocess.py b/backend/tests/test_library_postprocess.py new file mode 100644 index 0000000..d5010f4 --- /dev/null +++ b/backend/tests/test_library_postprocess.py @@ -0,0 +1,464 @@ +import shutil +import tempfile +import unittest +from pathlib import Path + +from backend.app.library_postprocess import DedupeRunner, OrganizeRunner +from backend.app.task_constants import create_empty_task_stats +from backend.app.task_store import TaskStore +from backend.app.task_stream import TaskStreamManager + + +class DedupeRunnerTests(unittest.TestCase): + def setUp(self): + self.root = Path(tempfile.mkdtemp()) + self.input_dir = self.root / 'input' + self.output_dir = self.root / 'output' + self.trash_dir = self.root / 'trash' + self.input_dir.mkdir() + self.output_dir.mkdir() + self.trash_dir.mkdir() + self.task_store = TaskStore(self.root / 'music_workshop.db') + self.runner = DedupeRunner(self.task_store, _NoopPreprocessor(), TaskStreamManager()) + self.runner._safe_probe_audio = lambda file_path: self.library_audio_props.get(file_path, {}) + self.runner._safe_read_library_tags = lambda file_path: self.library_tags.get(file_path, {}) + self.library_audio_props = {} + self.library_tags = {} + + def test_trashes_lower_quality_batch_duplicate(self): + task = self._create_task() + first_path = self._write_source('Artist/Album/01.flac') + second_path = self._write_source('Artist/Album/01-copy.flac') + + first_item = self._insert_matched_item( + task['task_id'], + first_path, + recording_id='recording-1', + confidence=88.0, + audio_props={'codec': 'FLAC', 'bit_depth': 16, 'sample_rate': 44100, 'bitrate': 900000, 'channels': 2, 'duration_seconds': 201} + ) + second_item = self._insert_matched_item( + task['task_id'], + second_path, + 
recording_id='recording-1', + confidence=95.0, + audio_props={'codec': 'FLAC', 'bit_depth': 24, 'sample_rate': 96000, 'bitrate': 1500000, 'channels': 2, 'duration_seconds': 201} + ) + + stats = create_empty_task_stats() + self.runner.run(task['task_id'], stats, self._config()) + + first_item = self.task_store.list_task_items(task['task_id'], None, 1, 10)['items'][0] + second_item = self.task_store.list_task_items(task['task_id'], None, 1, 10)['items'][1] + + self.assertEqual(first_item['dedupe_status'], 'duplicate_trashed') + self.assertFalse(first_item['is_active']) + self.assertEqual(first_item['duplicate_of_item_id'], second_item['id']) + self.assertTrue(Path(first_item['trash_file_path']).exists()) + self.assertEqual(second_item['dedupe_status'], 'unique') + self.assertEqual(stats['dedupe']['batch_duplicates'], 1) + self.assertEqual(stats['dedupe']['kept_items'], 1) + log_types = { + log['event_type'] + for log in self.task_store.list_task_logs(task['task_id'], 1, 50)['logs'] + } + self.assertIn('dedupe.lookup_started', log_types) + self.assertIn('dedupe.item_duplicate', log_types) + self.assertIn('dedupe.item_unique', log_types) + + def test_keeps_existing_library_file_by_default(self): + task = self._create_task() + source_path = self._write_source('Artist/Album/01.flac') + library_path = self._write_library('A/Artist/Album/01 - Song.flac') + self.library_audio_props[str(library_path)] = { + 'codec': 'FLAC', + 'bit_depth': 16, + 'sample_rate': 44100, + 'bitrate': 700000, + 'channels': 2, + 'duration_seconds': 201 + } + self.library_tags[str(library_path)] = { + 'title': 'Song', + 'artist': 'Artist', + 'album': 'Album', + 'albumartist': 'Artist', + 'tracknumber': '1', + 'discnumber': '1', + 'musicbrainzrecordingid': 'recording-1', + 'musicbrainzalbumid': 'release-1', + 'date': '2024-01-01' + } + + item = self._insert_matched_item(task['task_id'], source_path, recording_id='recording-1') + stats = create_empty_task_stats() + self.runner.run(task['task_id'], 
stats, self._config()) + + item = self.task_store.list_task_items(task['task_id'], None, 1, 10)['items'][0] + self.assertEqual(item['dedupe_status'], 'duplicate_trashed') + self.assertEqual(item['duplicate_of_path'], str(library_path.resolve(strict=False))) + self.assertTrue(Path(item['trash_file_path']).exists()) + self.assertEqual(stats['dedupe']['library_duplicates'], 1) + self.assertEqual(stats['dedupe']['replaced_library_items'], 0) + + def test_replaces_lower_quality_library_file_when_enabled(self): + task = self._create_task(replace=True) + source_path = self._write_source('Artist/Album/01.flac') + library_path = self._write_library('A/Artist/Album/01 - Song.flac') + self.library_audio_props[str(library_path)] = { + 'codec': 'MP3', + 'bit_depth': 16, + 'sample_rate': 44100, + 'bitrate': 128000, + 'channels': 2, + 'duration_seconds': 201 + } + self.library_tags[str(library_path)] = { + 'title': 'Song', + 'artist': 'Artist', + 'album': 'Album', + 'albumartist': 'Artist', + 'tracknumber': '1', + 'discnumber': '1', + 'musicbrainzrecordingid': 'recording-1', + 'musicbrainzalbumid': 'release-1', + 'date': '2024-01-01' + } + + item = self._insert_matched_item( + task['task_id'], + source_path, + recording_id='recording-1', + confidence=96.0, + audio_props={'codec': 'FLAC', 'bit_depth': 24, 'sample_rate': 96000, 'bitrate': 1600000, 'channels': 2, 'duration_seconds': 201} + ) + stats = create_empty_task_stats() + self.runner.run(task['task_id'], stats, self._config(replace=True)) + + item = self.task_store.list_task_items(task['task_id'], None, 1, 10)['items'][0] + self.assertEqual(item['dedupe_status'], 'duplicate_replaced') + self.assertTrue(Path(item['current_file_path']).exists()) + self.assertFalse(library_path.exists()) + self.assertEqual(item['duplicate_of_path'], str(library_path.resolve(strict=False))) + self.assertEqual(stats['dedupe']['replaced_library_items'], 1) + self.assertEqual(stats['dedupe']['kept_items'], 1) + + def 
test_version_mismatch_does_not_dedupe_on_text_key(self): + task = self._create_task() + source_path = self._write_source('Artist/Album/01.flac') + library_path = self._write_library('A/Artist/Singles/2024 - Song/01 - Song.flac') + self.library_audio_props[str(library_path)] = { + 'codec': 'FLAC', + 'bit_depth': 16, + 'sample_rate': 44100, + 'bitrate': 700000, + 'channels': 2, + 'duration_seconds': 201 + } + self.library_tags[str(library_path)] = { + 'title': 'Song', + 'artist': 'Artist', + 'albumartist': 'Artist', + 'date': '2024-01-01' + } + + item = self._insert_matched_item( + task['task_id'], + source_path, + recording_id=None, + release_id=None, + title='Song (Live)', + duration_seconds=201 + ) + stats = create_empty_task_stats() + self.runner.run(task['task_id'], stats, self._config()) + + item = self.task_store.list_task_items(task['task_id'], None, 1, 10)['items'][0] + self.assertEqual(item['dedupe_status'], 'unique') + self.assertEqual(stats['dedupe']['library_duplicates'], 0) + + def test_marks_item_failed_when_duplicate_source_file_is_missing(self): + task = self._create_task() + source_path = self._write_source('Artist/Album/01.flac') + library_path = self._write_library('A/Artist/Album/01 - Song.flac') + self.library_audio_props[str(library_path)] = { + 'codec': 'FLAC', + 'bit_depth': 16, + 'sample_rate': 44100, + 'bitrate': 700000, + 'channels': 2, + 'duration_seconds': 201 + } + self.library_tags[str(library_path)] = { + 'title': 'Song', + 'artist': 'Artist', + 'album': 'Album', + 'albumartist': 'Artist', + 'tracknumber': '1', + 'discnumber': '1', + 'musicbrainzrecordingid': 'recording-1' + } + + item = self._insert_matched_item(task['task_id'], source_path, recording_id='recording-1') + Path(source_path).unlink() + stats = create_empty_task_stats() + self.runner.run(task['task_id'], stats, self._config()) + + item = self.task_store.list_task_items(task['task_id'], None, 1, 10)['items'][0] + self.assertEqual(item['dedupe_status'], 'failed') + 
self.assertEqual(item['dedupe_reason'], 'source_missing') + self.assertEqual(stats['dedupe']['failed_items'], 1) + + def _create_task(self, replace: bool = False) -> dict: + return self.task_store.create_task_if_idle(self._config(replace=replace)) + + def _config(self, replace: bool = False) -> dict: + return { + 'input': str(self.input_dir), + 'output': str(self.output_dir), + 'trash': str(self.trash_dir), + 'advancedStrategy': { + 'replaceLowQualityDuplicates': replace + } + } + + def _write_source(self, relative_path: str) -> str: + path = self.input_dir / relative_path + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(b'audio') + return str(path.resolve(strict=False)) + + def _write_library(self, relative_path: str) -> Path: + path = self.output_dir / relative_path + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(b'library-audio') + return path + + def _insert_matched_item( + self, + task_id: str, + source_path: str, + *, + recording_id: str | None = 'recording-1', + release_id: str | None = 'release-1', + title: str = 'Song', + duration_seconds: int = 201, + confidence: float = 92.0, + audio_props: dict | None = None + ) -> dict: + path = Path(source_path) + return self.task_store.insert_task_item( + task_id, + original_path=source_path, + current_file_path=source_path, + relative_path=path.relative_to(self.input_dir).as_posix(), + filename=path.name, + extension=path.suffix.lower(), + size_bytes=path.stat().st_size, + modified_at='2024-01-01T00:00:00Z', + local_cover=None, + local_lyric=None, + scan_status='queued', + scan_reason=None, + scan_message=None, + preprocess_status='completed', + match_status='matched', + match_reason='authoritative_auto_match', + match_message='matched', + match_source='musicbrainz', + match_confidence=confidence, + match_is_authoritative=1, + audio_props_json=audio_props or { + 'codec': 'FLAC', + 'bit_depth': 16, + 'sample_rate': 44100, + 'bitrate': 700000, + 'channels': 2, + 
def setUp(self):
    """Create a throwaway workspace and the runner under test.

    Builds ``input``/``output``/``trash`` directories beneath a fresh
    temporary root, a ``TaskStore`` backed by ``music_workshop.db`` inside
    that root, and an ``OrganizeRunner`` wired to the store and a new
    ``TaskStreamManager``. The temporary root is registered for removal so
    test runs do not leave directories behind.
    """
    # Function-scope import keeps this method self-contained regardless of
    # which modules the test file imports at the top.
    import shutil

    self.root = Path(tempfile.mkdtemp())
    # mkdtemp() never deletes what it creates. Register the removal right
    # away so the tree is cleaned up even when a later setUp step or the
    # test body fails. ignore_errors keeps a cleanup problem (e.g. a file
    # still held open) from turning into a spurious test error.
    self.addCleanup(shutil.rmtree, self.root, ignore_errors=True)

    self.input_dir = self.root / 'input'
    self.output_dir = self.root / 'output'
    self.trash_dir = self.root / 'trash'
    self.input_dir.mkdir()
    self.output_dir.mkdir()
    self.trash_dir.mkdir()
    self.task_store = TaskStore(self.root / 'music_workshop.db')
    self.runner = OrganizeRunner(self.task_store, TaskStreamManager())
def test_resolves_target_collisions_with_suffix(self):
    """An occupied target path is expected to be resolved to '01 - Song (2).flac'.

    The file ``A/Artist/Album/01 - Song.flac`` is created in the output
    directory before the run; afterwards the item's library path must carry
    the `` (2)`` suffix and the ``collision_resolved`` counter must be 1.
    """
    task_id = self._create_task()['task_id']

    # Occupy the path the assertions below treat as the colliding target.
    occupied = self.output_dir.joinpath('A', 'Artist', 'Album', '01 - Song.flac')
    occupied.parent.mkdir(parents=True, exist_ok=True)
    occupied.write_bytes(b'existing')

    self._insert_organize_item(task_id, 'Artist/Album/source.flac')
    stats = create_empty_task_stats()

    self.runner.run(task_id, stats, self._config())

    listing = self.task_store.list_task_items(task_id, None, 1, 10)
    organized = listing['items'][0]
    self.assertEqual(organized['library_relative_path'], 'A/Artist/Album/01 - Song (2).flac')
    self.assertEqual(stats['organize']['collision_resolved'], 1)
original_move = self.runner._move_file + + def failing_move(source: Path, destination: Path): + if output_root in destination.resolve(strict=False).parents: + raise OSError('blocked') + return original_move(source, destination) + + self.runner._move_file = failing_move + + self.runner.run(task['task_id'], stats, self._config()) + + item = self.task_store.list_task_items(task['task_id'], None, 1, 10)['items'][0] + self.assertEqual(item['organize_status'], 'trashed') + self.assertTrue(Path(item['trash_file_path']).exists()) + self.assertEqual(stats['organize']['failed_items'], 1) + self.assertEqual(stats['organize']['trashed_items'], 1) + + def _create_task(self) -> dict: + return self.task_store.create_task_if_idle(self._config()) + + def _config(self) -> dict: + return { + 'input': str(self.input_dir), + 'output': str(self.output_dir), + 'trash': str(self.trash_dir) + } + + def _insert_organize_item( + self, + task_id: str, + relative_path: str, + *, + title: str = 'Song', + album: str | None = 'Album', + album_artist: str = 'Artist', + track_number: int = 1, + disc_number: int = 1, + year: int = 2024 + ) -> dict: + path = self.input_dir / relative_path + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(b'audio') + return self.task_store.insert_task_item( + task_id, + original_path=str(path.resolve(strict=False)), + current_file_path=str(path.resolve(strict=False)), + relative_path=relative_path, + filename=path.name, + extension=path.suffix.lower(), + size_bytes=path.stat().st_size, + modified_at='2024-01-01T00:00:00Z', + local_cover=None, + local_lyric=None, + scan_status='queued', + scan_reason=None, + scan_message=None, + preprocess_status='completed', + match_status='matched', + match_reason='authoritative_auto_match', + match_message='matched', + dedupe_status='unique', + organize_status='pending', + matched_metadata_json={ + 'title': title, + 'artist': album_artist, + 'artists': [album_artist], + 'album': album, + 'album_artist': 
def test_empty_output_dir_returns_empty_summary_and_tracks(self):
    """With no files in the output dir, every summary counter is 0 and the page is empty."""
    output_root = str(self.output_dir)
    summary = self.service.get_summary(output_root)
    page = self.service.get_tracks_page(output_root)

    zero_counters = (
        'total_tracks',
        'total_albums',
        'total_artists',
        'suspected_duplicates',
    )
    for counter in zero_counters:
        self.assertEqual(summary[counter], 0)
    self.assertEqual(page['items'], [])
    self.assertEqual(page['total'], 0)
self._write_library_file('A/Artist A/Album A/01 - Echoes.flac', _timestamp(2024, 1, 1)) + second_path = self._write_library_file('B/Artist B/Album B/03 - Neon.mp3', _timestamp(2024, 1, 2)) + self.preprocessor.audio_props[str(first_path)] = { + 'format': 'FLAC', + 'codec': 'FLAC', + 'bitrate': 980000, + 'sample_rate': 96000, + 'bit_depth': 24, + 'channels': 2, + 'duration_seconds': 301.4 + } + self.preprocessor.tags[str(first_path)] = { + 'title': 'Echoes', + 'artist': 'Artist A', + 'album': 'Album A', + 'albumartist': 'Artist A', + 'tracknumber': '1', + 'discnumber': '1', + 'date': '2024-01-01' + } + self.preprocessor.audio_props[str(second_path)] = { + 'format': 'MP3', + 'codec': 'MP3', + 'bitrate': 320000, + 'sample_rate': 44100, + 'bit_depth': 16, + 'channels': 2, + 'duration_seconds': 240.1 + } + self.preprocessor.tags[str(second_path)] = { + 'title': 'Neon', + 'artist': 'Artist B', + 'album': 'Album B', + 'albumartist': 'Artist B', + 'tracknumber': '3', + 'discnumber': '1', + 'date': '2023-12-01' + } + task = self._create_completed_task() + self._insert_provenance_item( + task['task_id'], + library_file_path=str(first_path), + library_relative_path='A/Artist A/Album A/01 - Echoes.flac', + updated_at='2024-01-03T09:00:00Z', + match_source='musicbrainz', + match_confidence=94.5, + dedupe_status='unique' + ) + + page = self.service.get_tracks_page( + str(self.output_dir), + q='echo', + artist='Artist A', + album='Album A', + format='flac', + has_provenance=True + ) + + self.assertEqual(page['total'], 1) + track = page['items'][0] + self.assertEqual(track['filename'], '01 - Echoes.flac') + self.assertEqual(track['title'], 'Echoes') + self.assertEqual(track['artist'], 'Artist A') + self.assertEqual(track['album'], 'Album A') + self.assertEqual(track['format'], 'FLAC') + self.assertEqual(track['codec'], 'FLAC') + self.assertEqual(track['bit_depth'], 24) + self.assertEqual(track['sample_rate'], 96000) + self.assertEqual(track['ingest_provenance']['task_id'], 
def test_default_sort_prefers_organized_at_and_falls_back_to_modified_at(self):
    """Check the default track order when only one file has a provenance row.

    Three files are written with mtimes 2024-01-03 (Fresh), 2024-01-01
    (Sorted) and 2023-12-31 (Archive). Only 'Sorted' gets a provenance item,
    stamped ``updated_at='2024-01-02T12:00:00Z'``. The expected page order
    is Fresh, Sorted, Archive.
    """
    fixtures = (
        ('N/Newest/Album/01 - Fresh.flac', _timestamp(2024, 1, 3), 'Fresh', 'Newest'),
        ('O/Organized/Album/01 - Sorted.flac', _timestamp(2024, 1, 1), 'Sorted', 'Organized'),
        ('Z/Oldest/Album/01 - Archive.flac', _timestamp(2023, 12, 31), 'Archive', 'Oldest'),
    )

    # Phase 1: write every library file, in the listed order.
    written = [
        self._write_library_file(relative_path, mtime)
        for relative_path, mtime, _title, _artist in fixtures
    ]

    # Phase 2: register fake probe results and tags for each written file.
    for library_path, (_relative, _mtime, title, artist) in zip(written, fixtures):
        key = str(library_path)
        self.preprocessor.audio_props[key] = {'format': 'FLAC', 'codec': 'FLAC'}
        self.preprocessor.tags[key] = {
            'title': title,
            'artist': artist,
            'album': 'Album',
            'albumartist': artist,
            'tracknumber': '1',
            'discnumber': '1'
        }

    organized_path = written[1]
    task_id = self._create_completed_task()['task_id']
    self._insert_provenance_item(
        task_id,
        library_file_path=str(organized_path),
        library_relative_path='O/Organized/Album/01 - Sorted.flac',
        updated_at='2024-01-02T12:00:00Z'
    )

    tracks = self.service.get_tracks_page(str(self.output_dir))['items']
    self.assertEqual(
        [track['title'] for track in tracks],
        ['Fresh', 'Sorted', 'Archive']
    )
'discnumber': '1' + } + + old_task = self._create_completed_task() + self._insert_provenance_item( + old_task['task_id'], + library_file_path=str(exact_path), + library_relative_path='A/Artist/Album/01 - Exact.flac', + updated_at='2024-01-01T08:00:00Z', + match_source='exact-source' + ) + newer_task = self._create_completed_task() + self._insert_provenance_item( + newer_task['task_id'], + library_file_path='/legacy/output/A/Artist/Album/01 - Exact.flac', + library_relative_path='A/Artist/Album/01 - Exact.flac', + updated_at='2024-01-03T08:00:00Z', + match_source='relative-source' + ) + fallback_task = self._create_completed_task() + self._insert_provenance_item( + fallback_task['task_id'], + library_file_path='/legacy/output/B/Artist/Album/02 - Fallback.flac', + library_relative_path='B/Artist/Album/02 - Fallback.flac', + updated_at='2024-01-04T08:00:00Z', + match_source='fallback-source' + ) + + page = self.service.get_tracks_page(str(self.output_dir), sort_by='filename', sort_order='asc') + exact_track = next(item for item in page['items'] if item['title'] == 'Exact') + fallback_track = next(item for item in page['items'] if item['title'] == 'Fallback') + + self.assertEqual(exact_track['ingest_provenance']['match_source'], 'exact-source') + self.assertEqual(fallback_track['ingest_provenance']['match_source'], 'fallback-source') + + def test_summary_counts_suspected_duplicates_without_false_live_match(self): + duplicate_one = self._write_library_file('A/Artist/Album/01 - Song.flac', _timestamp(2024, 1, 1)) + duplicate_two = self._write_library_file('A/Artist/Album/01 - Song Copy.flac', _timestamp(2024, 1, 2)) + studio = self._write_library_file('S/Artist/Singles/2024 - Ballad/01 - Ballad.flac', _timestamp(2024, 1, 3)) + live = self._write_library_file('S/Artist/Singles/2024 - Ballad Live/01 - Ballad Live.flac', _timestamp(2024, 1, 4)) + + for path in (duplicate_one, duplicate_two, studio, live): + self.preprocessor.audio_props[str(path)] = { + 'format': 'FLAC', + 
'codec': 'FLAC', + 'duration_seconds': 201 + } + + self.preprocessor.tags[str(duplicate_one)] = { + 'title': 'Song', + 'artist': 'Artist', + 'album': 'Album', + 'albumartist': 'Artist', + 'tracknumber': '1', + 'discnumber': '1', + 'musicbrainzrecordingid': 'recording-1' + } + self.preprocessor.tags[str(duplicate_two)] = { + 'title': 'Song', + 'artist': 'Artist', + 'album': 'Album', + 'albumartist': 'Artist', + 'tracknumber': '1', + 'discnumber': '1', + 'musicbrainzrecordingid': 'recording-1' + } + self.preprocessor.tags[str(studio)] = { + 'title': 'Ballad', + 'artist': 'Artist', + 'albumartist': 'Artist' + } + self.preprocessor.tags[str(live)] = { + 'title': 'Ballad (Live)', + 'artist': 'Artist', + 'albumartist': 'Artist' + } + + summary = self.service.get_summary(str(self.output_dir)) + + self.assertEqual(summary['total_tracks'], 4) + self.assertEqual(summary['suspected_duplicates'], 1) + + def test_move_track_to_exception_moves_file_and_creates_match_failed_item(self): + trash_dir = self.root / 'trash' + library_path = self._write_library_file('A/Artist/Album/01 - Song.flac', _timestamp(2024, 1, 5)) + self.preprocessor.audio_props[str(library_path)] = { + 'format': 'FLAC', + 'codec': 'FLAC', + 'duration_seconds': 180.25 + } + self.preprocessor.tags[str(library_path)] = { + 'title': 'Song', + 'artist': 'Artist', + 'album': 'Album', + 'albumartist': 'Artist', + 'tracknumber': '1' + } + self.preprocessor.fingerprints[str(library_path)] = { + 'fingerprint': 'abc123', + 'duration_seconds': 180.0 + } + track = self.service.get_tracks_page(str(self.output_dir))['items'][0] + + response = self.service.move_track_to_exception( + { + 'input': str(self.root / 'input'), + 'output': str(self.output_dir), + 'trash': str(trash_dir) + }, + track['track_id'] + ) + + trash_path = Path(response['trash_file_path']) + self.assertFalse(library_path.exists()) + self.assertTrue(trash_path.exists()) + self.assertEqual(response['library_relative_path'], 'A/Artist/Album/01 - Song.flac') + 
self.assertEqual(self.service.get_tracks_page(str(self.output_dir))['total'], 0) + + exception_service = ExceptionService(self.task_store) + page = exception_service.get_items('match_failed') + self.assertEqual(page['total'], 1) + exception_item = page['items'][0] + self.assertEqual(exception_item['exception_id'], response['exception_id']) + self.assertEqual(exception_item['exception_type'], 'match_failed') + self.assertEqual(exception_item['exception_reason_code'], 'manual_library_requeue') + self.assertEqual(exception_item['trash_file_path'], str(trash_path)) + self.assertEqual(exception_item['library_file_path'], str(library_path)) + self.assertIn('retry_match', exception_item['available_actions']) + + source_item = self.task_store.get_exception_source_item(response['exception_id']) + self.assertEqual(source_item['current_file_path'], str(trash_path)) + self.assertEqual(source_item['original_tags_json']['title'], 'Song') + self.assertEqual(source_item['audio_props_json']['codec'], 'FLAC') + self.assertEqual(source_item['acoustic_fingerprint'], 'abc123') + + def test_move_track_to_exception_rejects_unknown_track_id(self): + with self.assertRaises(LibraryTrackNotFoundError): + self.service.move_track_to_exception( + { + 'input': str(self.root / 'input'), + 'output': str(self.output_dir), + 'trash': str(self.root / 'trash') + }, + 'unknown-track-id' + ) + + def _write_library_file(self, relative_path: str, modified_at_timestamp: int) -> Path: + path = self.output_dir / relative_path + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(b'audio-data') + os.utime(path, (float(modified_at_timestamp), float(modified_at_timestamp))) + return path + + def _create_completed_task(self) -> dict: + task = self.task_store.create_task_if_idle( + {'input': '', 'output': str(self.output_dir), 'trash': ''} + ) + self.task_store.update_task( + task['task_id'], + status='completed', + completed_at='2024-01-01T00:00:00Z' + ) + return task + + def 
_insert_provenance_item( + self, + task_id: str, + *, + library_file_path: str, + library_relative_path: str, + updated_at: str, + match_source: str | None = None, + match_confidence: float | None = None, + dedupe_status: str = 'unique' + ): + item = self.task_store.insert_task_item( + task_id, + original_path=library_file_path, + current_file_path=library_file_path, + relative_path=library_relative_path, + filename=Path(library_file_path).name, + extension=Path(library_file_path).suffix.lower(), + size_bytes=123, + modified_at='2024-01-01T00:00:00Z', + local_cover=None, + local_lyric=None, + scan_status='queued', + scan_reason=None, + scan_message=None, + preprocess_status='completed', + match_status='matched', + match_source=match_source, + match_confidence=match_confidence, + dedupe_status=dedupe_status, + organize_status='organized', + library_relative_path=library_relative_path, + library_file_path=library_file_path + ) + with self.task_store._connect() as connection: + connection.execute( + 'UPDATE task_items SET updated_at = ? 
WHERE id = ?', + (updated_at, item['id']) + ) + connection.commit() + + +class _FakePreprocessor: + def __init__(self): + self.audio_props: dict[str, dict] = {} + self.tags: dict[str, dict] = {} + self.fingerprints: dict[str, dict] = {} + + def probe_audio(self, file_path: str) -> dict: + return self.audio_props.get(file_path, {}) + + def read_tags(self, file_path: str) -> dict: + return self.tags.get(file_path, {}) + + def calculate_fingerprint(self, file_path: str) -> dict: + return self.fingerprints.get(file_path, {}) + + +def _timestamp(year: int, month: int, day: int) -> int: + return int(datetime(year, month, day, tzinfo=timezone.utc).timestamp()) + + +if __name__ == '__main__': + unittest.main() diff --git a/backend/tests/test_match_providers.py b/backend/tests/test_match_providers.py new file mode 100644 index 0000000..7a929a7 --- /dev/null +++ b/backend/tests/test_match_providers.py @@ -0,0 +1,186 @@ +import json +import time +import unittest +from unittest.mock import patch +from urllib import error + +from backend.app.matcher import MatchHttpClient, MusicBrainzProvider, SpotifyProvider + + +class MatchProviderTests(unittest.TestCase): + def test_match_http_client_retries_url_errors(self): + client = MatchHttpClient() + + with patch( + 'backend.app.matcher.request.urlopen', + side_effect=[ + error.URLError('temporary dns error'), + FakeResponse({'ok': True}) + ] + ) as mock_urlopen: + payload = client.request_json('test', 'https://example.com', retries=1) + + self.assertEqual(payload['ok'], True) + self.assertEqual(mock_urlopen.call_count, 2) + + def test_match_http_client_retries_timeout(self): + client = MatchHttpClient() + + with patch( + 'backend.app.matcher.request.urlopen', + side_effect=[ + TimeoutError('timeout'), + FakeResponse({'ok': True}) + ] + ) as mock_urlopen: + payload = client.request_json('test', 'https://example.com', retries=1) + + self.assertEqual(payload['ok'], True) + self.assertEqual(mock_urlopen.call_count, 2) + + def 
test_musicbrainz_requests_use_user_agent_and_throttle(self): + client = MatchHttpClient() + provider = MusicBrainzProvider(client) + observed_headers = [] + + def fake_urlopen(req, timeout): + observed_headers.append(dict(req.header_items())) + return FakeResponse({'recordings': []}) + + with patch('backend.app.matcher.request.urlopen', side_effect=fake_urlopen) as mock_urlopen: + with patch('backend.app.matcher.time.sleep') as mock_sleep: + with patch( + 'backend.app.matcher.time.monotonic', + side_effect=[0.0, 0.0, 0.1, 0.1] + ): + provider._request_json( + 'musicbrainz', + 'https://musicbrainz.org/ws/2/recording', + params={'fmt': 'json'} + ) + provider._request_json( + 'musicbrainz', + 'https://musicbrainz.org/ws/2/recording', + params={'fmt': 'json'} + ) + + self.assertEqual(mock_urlopen.call_count, 2) + self.assertTrue(any('User-agent' in headers or 'User-Agent' in headers for headers in observed_headers)) + self.assertTrue(mock_sleep.called) + + def test_spotify_provider_refreshes_expired_token(self): + provider = SpotifyProvider(MatchHttpClient()) + config = { + 'metadata': { + 'spotifyUrl': 'https://api.spotify.com/v1', + 'spotifyClientId': 'spotify-id', + 'spotifySecret': 'spotify-secret' + } + } + observed_authorization = [] + token_counter = {'value': 0} + + def fake_urlopen(req, timeout): + url = req.full_url + if 'api/token' in url: + token_counter['value'] += 1 + return FakeResponse( + { + 'access_token': f'token-{token_counter["value"]}', + 'expires_in': 3600 + } + ) + + observed_authorization.append(req.headers.get('Authorization')) + return FakeResponse( + { + 'tracks': { + 'items': [ + { + 'id': 'track-1', + 'name': 'Song Title', + 'artists': [{'name': 'Song Artist'}], + 'album': { + 'id': 'album-1', + 'name': 'Album Name', + 'release_date': '2024-01-01', + 'images': [] + }, + 'track_number': 1, + 'disc_number': 1, + 'duration_ms': 201000 + } + ] + } + } + ) + + with patch('backend.app.matcher.request.urlopen', side_effect=fake_urlopen): + 
class FakeHeaders:
    """Header stub that always reports ``utf-8`` as the content charset."""

    def get_content_charset(self):
        charset = 'utf-8'
        return charset


class FakeResponse:
    """Context-manager response stub whose body is a JSON-encoded payload."""

    def __init__(self, payload):
        # Serialise once up front; read() hands back the same bytes each time.
        serialized = json.dumps(payload)
        self._payload = serialized.encode('utf-8')
        self.headers = FakeHeaders()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # A falsy return lets exceptions raised inside the ``with`` body propagate.
        return False

    def read(self):
        return self._payload
def test_matches_authoritative_acoustid_candidate(self):
    """One authoritative AcoustID candidate (fingerprint confidence 0.98) must be matched.

    Expects status 'matched', source 'acoustid', an authoritative flag and a
    confidence of at least 85.
    """
    item = build_item()
    acoustid_hit = build_candidate(
        provider='acoustid',
        is_authoritative=True,
        fingerprint_confidence=0.98
    )

    outcome = build_matcher(acoustid_candidates=[acoustid_hit]).match_item(
        item,
        [item],
        DEFAULT_CONFIG
    )

    for field, expected in (('status', 'matched'), ('source', 'acoustid')):
        self.assertEqual(outcome[field], expected)
    self.assertTrue(outcome['is_authoritative'])
    self.assertGreaterEqual(outcome['confidence'], 85)
def test_respects_repair_provider_scope(self):
    """With ``repair_provider_scope=['spotify']`` the Spotify fallback must win.

    Both an authoritative MusicBrainz candidate and a non-authoritative
    Spotify candidate are available; the expected result is a
    'matched_fallback' from 'spotify' that is not authoritative.
    """
    item = build_item()
    musicbrainz_hit = build_candidate(
        provider='musicbrainz',
        is_authoritative=True,
        search_confidence=0.95
    )
    spotify_ids = {
        'spotify_track_id': 'track-1',
        'spotify_album_id': 'album-1'
    }
    spotify_hit = build_candidate(
        provider='spotify',
        is_authoritative=False,
        search_confidence=0.9,
        source_ids=spotify_ids
    )
    matcher = build_matcher(
        musicbrainz_candidates=[musicbrainz_hit],
        spotify_candidates=[spotify_hit]
    )

    # Deep-copy first so the shared module-level config is never mutated.
    scoped_config = {
        **copy.deepcopy(DEFAULT_CONFIG),
        'repair_provider_scope': ['spotify']
    }

    outcome = matcher.match_item(item, [item], scoped_config)

    self.assertEqual(outcome['status'], 'matched_fallback')
    self.assertEqual(outcome['source'], 'spotify')
    self.assertFalse(outcome['is_authoritative'])
def test_skips_acoustid_error_and_matches_musicbrainz_text_candidate(self):
    """An AcoustID provider error must not prevent a MusicBrainz match.

    Expects status 'matched' from 'musicbrainz' and exactly one provider
    warning, attributed to 'acoustid'.
    """
    item = build_item()
    musicbrainz_hit = build_candidate(
        provider='musicbrainz',
        is_authoritative=True,
        search_confidence=0.92
    )
    acoustid_failure = MatchProviderError('acoustid', 'acoustid failed')
    matcher = build_matcher(
        acoustid_error=acoustid_failure,
        musicbrainz_candidates=[musicbrainz_hit]
    )

    outcome = matcher.match_item(item, [item], DEFAULT_CONFIG)

    self.assertEqual(outcome['status'], 'matched')
    self.assertEqual(outcome['source'], 'musicbrainz')
    warnings = outcome['provider_warnings']
    self.assertEqual(len(warnings), 1)
    self.assertEqual(warnings[0]['provider'], 'acoustid')
result['provider_warnings']], + ['acoustid', 'musicbrainz', 'netease', 'qq', 'spotify'] + ) + + def test_album_context_converges_to_single_release(self): + item_one = build_item(title='Song Title', track_number=1, duration_seconds=201) + item_two = build_item( + title='Song Title', + track_number=2, + duration_seconds=233, + relative_path='Artist/Album/02.flac', + filename='02.flac' + ) + group = [item_one, item_two] + def dynamic_musicbrainz_candidates(item_metadata, _config, **_kwargs): + track_number = item_metadata.get('track_number') + duration_seconds = item_metadata.get('duration_seconds') + release_a = build_candidate( + provider='musicbrainz', + is_authoritative=True, + search_confidence=0.91, + track_number=track_number, + duration_seconds=duration_seconds, + recording_id=f'recording-a-{track_number}', + release_id='release-a', + release_group_id='group-a', + release_tracklist=[ + { + 'title': 'Song Title', + 'track_number': 1, + 'disc_number': 1, + 'duration_seconds': 201 + }, + { + 'title': 'Song Title', + 'track_number': 2, + 'disc_number': 1, + 'duration_seconds': 233 + } + ], + source_ids={ + 'musicbrainz_recording_id': f'recording-a-{track_number}', + 'musicbrainz_release_id': 'release-a', + 'musicbrainz_release_group_id': 'group-a' + } + ) + release_b = build_candidate( + provider='musicbrainz', + is_authoritative=True, + search_confidence=0.91, + track_number=track_number, + duration_seconds=duration_seconds, + recording_id=f'recording-b-{track_number}', + release_id='release-b', + release_group_id='group-b', + release_tracklist=[ + { + 'title': 'Track Zero', + 'track_number': 1, + 'disc_number': 1, + 'duration_seconds': 120 + }, + { + 'title': 'Track Extra', + 'track_number': 2, + 'disc_number': 1, + 'duration_seconds': 310 + } + ], + source_ids={ + 'musicbrainz_recording_id': f'recording-b-{track_number}', + 'musicbrainz_release_id': 'release-b', + 'musicbrainz_release_group_id': 'group-b' + } + ) + return [release_a, release_b] + + matcher = 
build_matcher( + musicbrainz_candidates=dynamic_musicbrainz_candidates + ) + + result_one = matcher.match_item(item_one, group, DEFAULT_CONFIG) + result_two = matcher.match_item(item_two, group, DEFAULT_CONFIG) + + self.assertEqual(result_one['status'], 'matched') + self.assertEqual(result_two['status'], 'matched') + self.assertEqual(result_one['matched_metadata_json']['release_id'], 'release-a') + self.assertEqual(result_two['matched_metadata_json']['release_id'], 'release-a') + + +def build_matcher( + *, + acoustid_candidates=None, + musicbrainz_candidates=None, + aligned_candidate=None, + netease_candidates=None, + qq_candidates=None, + spotify_candidates=None, + acoustid_error=None, + musicbrainz_error=None, + netease_error=None, + qq_error=None, + spotify_error=None +): + return Matcher( + acoustid_provider=StaticSearchProvider(acoustid_candidates, error=acoustid_error), + musicbrainz_provider=StaticMusicBrainzProvider( + musicbrainz_candidates, + aligned_candidate=aligned_candidate, + error=musicbrainz_error + ), + netease_provider=StaticSearchProvider(netease_candidates, error=netease_error), + qq_provider=StaticSearchProvider(qq_candidates, error=qq_error), + spotify_provider=StaticSearchProvider(spotify_candidates, error=spotify_error), + discogs_provider=StaticEnrichmentProvider(), + lastfm_provider=StaticEnrichmentProvider(), + genius_provider=StaticEnrichmentProvider() + ) + + +def build_item( + *, + title='Song Title', + artist='Song Artist', + album='Album Name', + track_number=1, + disc_number=1, + duration_seconds=201, + relative_path='Artist/Album/01.flac', + filename='01.flac' +): + return { + 'id': 1, + 'task_id': 'task-1', + 'original_path': f'/tmp/{filename}', + 'current_file_path': f'/tmp/{filename}', + 'relative_path': relative_path, + 'filename': filename, + 'original_tags_json': { + 'title': title, + 'artist': artist, + 'album': album, + 'album_artist': artist, + 'track_number': str(track_number), + 'disc_number': str(disc_number) + }, + 
'audio_props_json': { + 'duration_seconds': duration_seconds + }, + 'acoustic_fingerprint': 'fingerprint', + 'fingerprint_duration_seconds': duration_seconds, + 'scan_status': 'queued', + 'preprocess_status': 'completed' + } + + +def build_candidate( + *, + provider, + is_authoritative, + title='Song Title', + artist='Song Artist', + album='Album Name', + track_number=1, + disc_number=1, + duration_seconds=201, + recording_id='recording-main', + release_id='release-main', + release_group_id='release-group-main', + fingerprint_confidence=None, + search_confidence=None, + release_tracklist=None, + source_ids=None +): + return { + 'provider': provider, + 'is_authoritative': is_authoritative, + 'title': title, + 'artist': artist, + 'artists': [artist], + 'album': album, + 'album_artist': artist, + 'track_number': track_number, + 'disc_number': disc_number, + 'release_date': '2024-01-01', + 'year': 2024, + 'duration_seconds': duration_seconds, + 'recording_id': recording_id, + 'release_id': release_id, + 'release_group_id': release_group_id, + 'source_ids': source_ids or { + 'musicbrainz_recording_id': recording_id, + 'musicbrainz_release_id': release_id, + 'musicbrainz_release_group_id': release_group_id + }, + 'fingerprint_confidence': fingerprint_confidence, + 'search_confidence': search_confidence, + 'release_tracklist': release_tracklist or [ + { + 'title': title, + 'track_number': track_number, + 'disc_number': disc_number, + 'duration_seconds': duration_seconds + } + ] + } + + +class StaticSearchProvider: + def __init__(self, candidates=None, *, error=None): + self.candidates = candidates or [] + self.error = error + + def search(self, *args, **kwargs): + if self.error: + raise self.error + if callable(self.candidates): + return copy.deepcopy(self.candidates(*args, **kwargs)) + return copy.deepcopy(self.candidates) + + +class StaticMusicBrainzProvider: + def __init__(self, candidates=None, *, aligned_candidate=None, error=None): + self.candidates = candidates or 
[] + self.aligned_candidate = aligned_candidate + self.error = error + + def search_text(self, *args, **kwargs): + if self.error: + raise self.error + if callable(self.candidates): + return copy.deepcopy(self.candidates(*args, **kwargs)) + return copy.deepcopy(self.candidates) + + def align_candidate(self, *args, **kwargs): + if self.error: + raise self.error + return copy.deepcopy(self.aligned_candidate) + + +class StaticEnrichmentProvider: + def enrich(self, *args, **kwargs): + return None + + +if __name__ == '__main__': + unittest.main() diff --git a/backend/tests/test_metadata_normalization.py b/backend/tests/test_metadata_normalization.py new file mode 100644 index 0000000..d3296a1 --- /dev/null +++ b/backend/tests/test_metadata_normalization.py @@ -0,0 +1,95 @@ +import os +import tempfile +import unittest +from pathlib import Path + +os.environ['MUSIC_WORKSHOP_DB_PATH'] = str( + Path(tempfile.gettempdir()) / f'music_workshop_metadata_normalization_{next(tempfile._get_candidate_names())}.db' +) + +from backend.app.metadata_normalization import MetadataNormalizationService, can_ingest_metadata, parse_artist_string +from backend.app.task_store import TaskStore + + +class MetadataNormalizationTests(unittest.TestCase): + def setUp(self): + self.db_path = Path(os.environ['MUSIC_WORKSHOP_DB_PATH']) + if self.db_path.exists(): + self.db_path.unlink() + self.task_store = TaskStore(self.db_path) + self.task = self.task_store.create_task_if_idle( + { + 'input': '/tmp/input', + 'output': '/tmp/output', + 'trash': '/tmp/trash' + } + ) + self.service = MetadataNormalizationService(self.task_store) + + def test_parse_artist_string_supports_common_delimiters(self): + self.assertEqual(parse_artist_string('A / B')['tokens'], ['A', 'B']) + self.assertEqual(parse_artist_string('A; B')['tokens'], ['A', 'B']) + self.assertEqual(parse_artist_string('A & B')['tokens'], ['A', 'B']) + self.assertEqual(parse_artist_string('A feat. 
B')['tokens'], ['A', 'B']) + self.assertEqual(parse_artist_string('A、B')['tokens'], ['A', 'B']) + + def test_single_artist_album_derives_album_artist(self): + item = self._insert_item('track-01.flac', {'title': 'Song 1', 'artist': 'Artist A', 'album': 'Album X'}) + self._insert_item('track-02.flac', {'title': 'Song 2', 'artist': 'Artist A', 'album': 'Album X'}) + + normalized = self.service.normalize_item(item) + self.assertEqual(normalized['album_artist'], 'Artist A') + self.assertEqual(normalized['normalization_strategy'], 'single_artist') + self.assertTrue(can_ingest_metadata({**normalized, 'title': 'Song 1'})) + + def test_feat_album_uses_dominant_primary_artist(self): + item = self._insert_item('track-01.flac', {'title': 'Song 1', 'artist': 'Artist A feat. Guest', 'album': 'Album X'}) + self._insert_item('track-02.flac', {'title': 'Song 2', 'artist': 'Artist A', 'album': 'Album X'}) + self._insert_item('track-03.flac', {'title': 'Song 3', 'artist': 'Artist A & Another', 'album': 'Album X'}) + + normalized = self.service.normalize_item(item) + self.assertEqual(normalized['album_artist'], 'Artist A') + self.assertEqual(normalized['normalization_strategy'], 'main_artist_feat') + + def test_compilation_album_sets_various_artists(self): + item = self._insert_item('track-01.flac', {'title': 'Song 1', 'artist': 'Artist A', 'album': 'Top Hits 2025'}) + self._insert_item('track-02.flac', {'title': 'Song 2', 'artist': 'Artist B', 'album': 'Top Hits 2025'}) + self._insert_item('track-03.flac', {'title': 'Song 3', 'artist': 'Artist C', 'album': 'Top Hits 2025'}) + + normalized = self.service.normalize_item(item) + self.assertEqual(normalized['album_artist'], 'Various Artists') + self.assertEqual(normalized['compilation'], 1) + self.assertEqual(normalized['normalization_strategy'], 'compilation') + + def test_existing_album_artist_is_preserved(self): + item = self._insert_item( + 'track-01.flac', + {'title': 'Song 1', 'artist': '阿信', 'album': 'Solo Album', 'album_artist': 
'五月天'} + ) + + normalized = self.service.normalize_item(item) + self.assertEqual(normalized['album_artist'], '五月天') + self.assertEqual(normalized['normalization_strategy'], 'source_preserved') + + def _insert_item(self, filename: str, matched_metadata_json: dict): + return self.task_store.insert_task_item( + self.task['task_id'], + original_path=f'/tmp/input/{filename}', + current_file_path=f'/tmp/input/{filename}', + relative_path=f'Artist/Album/{filename}', + filename=filename, + extension='.flac', + size_bytes=123456, + modified_at='2024-01-01T00:00:00Z', + local_cover=None, + local_lyric=None, + scan_status='queued', + scan_reason=None, + scan_message=None, + match_status='matched_fallback', + matched_metadata_json=matched_metadata_json + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/backend/tests/test_scanner.py b/backend/tests/test_scanner.py new file mode 100644 index 0000000..9b70b2c --- /dev/null +++ b/backend/tests/test_scanner.py @@ -0,0 +1,75 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from backend.app.scanner import probe_local_assets + + +class ScannerAssetProbeTests(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.library_dir = Path(self.temp_dir.name) + + def tearDown(self): + self.temp_dir.cleanup() + + def test_probe_local_assets_supports_cover_jpeg(self): + audio_path = self._create_file('album/track.flac') + cover_path = self._create_file('album/cover.jpeg') + + assets = probe_local_assets(audio_path) + + self.assertEqual(assets['local_cover'], str(cover_path.resolve())) + self.assertIsNone(assets['local_lyric']) + + def test_probe_local_assets_supports_folder_webp(self): + audio_path = self._create_file('album/track.flac') + cover_path = self._create_file('album/folder.webp') + + assets = probe_local_assets(audio_path) + + self.assertEqual(assets['local_cover'], str(cover_path.resolve())) + + def 
test_probe_local_assets_matches_track_name_case_insensitively(self): + audio_path = self._create_file('album/Track.FLAC') + cover_path = self._create_file('album/track.JPEG') + lyric_path = self._create_file('album/TRACK.LRC', '[00:00.00] lyric') + + assets = probe_local_assets(audio_path) + + self.assertEqual(assets['local_cover'], str(cover_path.resolve())) + self.assertEqual(assets['local_lyric'], str(lyric_path.resolve())) + + def test_probe_local_assets_respects_cover_priority(self): + audio_path = self._create_file('album/song.flac') + self._create_file('album/song.jpg') + self._create_file('album/folder.jpg') + preferred_cover = self._create_file('album/cover.png') + self._create_file('album/cover.webp') + + assets = probe_local_assets(audio_path) + + self.assertEqual(assets['local_cover'], str(preferred_cover.resolve())) + + def test_probe_local_assets_ignores_symlink_covers(self): + audio_path = self._create_file('album/song.flac') + fallback_cover = self._create_file('album/folder.webp') + target_path = self._create_file('targets/real-cover.jpeg') + symlink_path = self.library_dir / 'album' / 'cover.jpeg' + symlink_path.parent.mkdir(parents=True, exist_ok=True) + os.symlink(target_path, symlink_path) + + assets = probe_local_assets(audio_path) + + self.assertEqual(assets['local_cover'], str(fallback_cover.resolve())) + + def _create_file(self, relative_path: str, content: str = '') -> Path: + path = self.library_dir / relative_path + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding='utf-8') + return path + + +if __name__ == '__main__': + unittest.main() diff --git a/backend/tests/test_scanner_task_store.py b/backend/tests/test_scanner_task_store.py new file mode 100644 index 0000000..1af8f8c --- /dev/null +++ b/backend/tests/test_scanner_task_store.py @@ -0,0 +1,122 @@ +import os +import tempfile +import time +import unittest +from pathlib import Path + +from backend.app.scanner import Scanner +from backend.app.task_store 
import TaskConflictError, TaskStore + + +class ScannerTaskStoreTests(unittest.TestCase): + def test_scanner_queues_stable_audio_and_binds_local_assets(self): + items = [] + root = Path(tempfile.mkdtemp()) + album_dir = root / 'Artist' / 'Album' + album_dir.mkdir(parents=True) + audio_file = album_dir / '01.flac' + cover_file = album_dir / 'cover.jpeg' + lyric_file = album_dir / '01.lrc' + audio_file.write_bytes(b'abc') + cover_file.write_bytes(b'img') + lyric_file.write_text('[00:00.00] lyric', encoding='utf-8') + + stable_timestamp = time.time() - 120 + os.utime(audio_file, (stable_timestamp, stable_timestamp)) + + stats = Scanner().scan(str(root), on_item=lambda item: items.append(item.to_dict())) + + self.assertEqual(stats['queued'], 1) + self.assertEqual(items[0]['scan_status'], 'queued') + self.assertTrue(items[0]['local_cover'].endswith('cover.jpeg')) + self.assertTrue(items[0]['local_lyric'].endswith('01.lrc')) + + def test_scanner_skips_recently_modified_audio(self): + items = [] + root = Path(tempfile.mkdtemp()) + album_dir = root / 'Artist' + album_dir.mkdir(parents=True) + audio_file = album_dir / 'new.flac' + audio_file.write_bytes(b'abc') + + stats = Scanner().scan(str(root), on_item=lambda item: items.append(item.to_dict())) + + self.assertEqual(stats['skipped_locked'], 1) + self.assertEqual(items[0]['scan_status'], 'skipped_locked') + self.assertEqual(items[0]['scan_reason'], 'recent_mtime') + + def test_task_store_repairs_stale_tasks_on_startup(self): + db_path = Path(tempfile.mkdtemp()) / 'scanner_tasks.db' + store = TaskStore(db_path) + task = store.create_task_if_idle( + { + 'input': '/tmp/input', + 'output': '/tmp/output', + 'trash': '/tmp/trash' + } + ) + store.update_task( + task['task_id'], + status='running', + current_stage='scan', + stage_states={ + 'scan': 'running', + 'preprocess': 'pending', + 'match': 'pending', + 'dedupe': 'pending', + 'organize': 'pending', + 'complete': 'pending' + }, + stats={ + 'scan': { + 'total_found': 10, + 
'queued': 8, + 'skipped_locked': 1, + 'skipped_invalid': 1, + 'ignored_non_audio': 5 + }, + 'preprocess': { + 'input_items': 0, + 'output_items': 0, + 'split_parents': 0, + 'generated_children': 0, + 'converted_items': 0, + 'metadata_snapshots': 0, + 'fingerprints_ok': 0, + 'fingerprints_failed': 0, + 'failed_items': 0, + 'warning_items': 0 + } + } + ) + + repaired_task_ids = store.fail_stale_active_tasks() + repaired_task = store.get_task(task['task_id']) + + self.assertIn(task['task_id'], repaired_task_ids) + self.assertEqual(repaired_task['status'], 'failed') + self.assertEqual(repaired_task['error_message'], 'Service restarted unexpectedly') + + def test_task_store_isolates_active_ingest_and_repair_tasks(self): + db_path = Path(tempfile.mkdtemp()) / 'scanner_tasks_repair.db' + store = TaskStore(db_path) + ingest = store.create_task_if_idle( + {'input': '/tmp/input', 'output': '/tmp/output', 'trash': '/tmp/trash'} + ) + repair = store.create_task_if_idle( + {'input': '/tmp/input', 'output': '/tmp/output', 'trash': '/tmp/trash'}, + task_type='repair' + ) + + self.assertEqual(store.get_active_task()['task_id'], ingest['task_id']) + self.assertEqual(store.get_active_task('repair')['task_id'], repair['task_id']) + + with self.assertRaises(TaskConflictError): + store.create_task_if_idle( + {'input': '/tmp/input', 'output': '/tmp/output', 'trash': '/tmp/trash'}, + task_type='repair' + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/backend/tests/test_task_api.py b/backend/tests/test_task_api.py new file mode 100644 index 0000000..cf1062d --- /dev/null +++ b/backend/tests/test_task_api.py @@ -0,0 +1,394 @@ +import os +import tempfile +import unittest +from pathlib import Path + +os.environ['MUSIC_WORKSHOP_DB_PATH'] = str( + Path(tempfile.gettempdir()) / f'music_workshop_task_api_{next(tempfile._get_candidate_names())}.db' +) + +from backend.app.task_store import TaskStore + +try: + from backend.app.schemas import TaskHistoryListResponse + import 
backend.app.main as main_module +except ModuleNotFoundError as error: + main_module = None + TaskHistoryListResponse = None + FASTAPI_IMPORT_ERROR = error +else: + FASTAPI_IMPORT_ERROR = None + + +class TaskStoreTests(unittest.TestCase): + def setUp(self): + self.db_path = Path(os.environ['MUSIC_WORKSHOP_DB_PATH']) + if self.db_path.exists(): + self.db_path.unlink() + self.task_store = TaskStore(self.db_path) + self._item_index = 0 + + def test_get_task_items_filters_new_status_fields_and_serializes_postprocess_fields(self): + task = self.task_store.create_task_if_idle( + { + 'input': '/tmp/input', + 'output': '/tmp/output', + 'trash': '/tmp/trash' + } + ) + self.task_store.insert_task_item( + task['task_id'], + original_path='/tmp/source-1.flac', + current_file_path='/tmp/source-1.flac', + relative_path='Artist/Album/01.flac', + filename='01.flac', + extension='.flac', + size_bytes=123, + modified_at='2024-01-01T00:00:00Z', + local_cover=None, + local_lyric=None, + scan_status='queued', + scan_reason=None, + scan_message=None, + preprocess_status='completed', + match_status='matched', + match_reason='authoritative_auto_match', + match_message='matched', + match_source='musicbrainz', + match_confidence=92.5, + match_is_authoritative=1, + matched_metadata_json={ + 'title': 'Song Title', + 'artist': 'Song Artist', + 'artists': ['Song Artist'], + 'album': 'Album Name', + 'album_artist': 'Song Artist', + 'track_number': 1, + 'disc_number': 1, + 'release_date': '2024-01-01', + 'year': 2024, + 'duration_seconds': 201, + 'recording_id': 'recording-1', + 'release_id': 'release-1', + 'release_group_id': 'group-1', + 'source_ids': {'musicbrainz_recording_id': 'recording-1'} + }, + match_candidates_json=[ + { + 'provider': 'musicbrainz', + 'score': 92.5, + 'score_breakdown': {'title': 20}, + 'is_authoritative': True, + 'recording_id': 'recording-1', + 'release_id': 'release-1', + 'release_group_id': 'group-1', + 'source_ids': {'musicbrainz_recording_id': 'recording-1'} + } + 
], + match_enrichment_json={ + 'cover': {'selected_source': None, 'candidates': []}, + 'lyrics': {'selected_source': None, 'candidates': []}, + 'genres': {'selected_source': None, 'candidates': []}, + 'tags': {'selected_source': None, 'candidates': []} + }, + dedupe_status='unique', + dedupe_reason=None, + dedupe_message='kept', + dedupe_group_key='recording-1', + dedupe_decision_json={ + 'comparison_scope': 'library', + 'identity_basis': 'recording_id', + 'kept_side': 'batch' + }, + organize_status='organized', + organize_reason=None, + organize_message='organized', + library_relative_path='S/Song Artist/Album Name/01 - Song Title.flac', + library_file_path='/tmp/output/S/Song Artist/Album Name/01 - Song Title.flac', + trash_file_path=None, + organize_decision_json={ + 'source_path': '/tmp/source-1.flac', + 'final_relative_path': 'S/Song Artist/Album Name/01 - Song Title.flac', + 'final_action': 'organized' + } + ) + self.task_store.insert_task_item( + task['task_id'], + original_path='/tmp/source-2.flac', + current_file_path='/tmp/source-2.flac', + relative_path='Artist/Album/02.flac', + filename='02.flac', + extension='.flac', + size_bytes=123, + modified_at='2024-01-01T00:00:00Z', + local_cover=None, + local_lyric=None, + scan_status='queued', + scan_reason=None, + scan_message=None, + preprocess_status='completed', + match_status='low_score', + match_reason='score_gap_too_small', + match_message='low score' + ) + + response = self.task_store.list_task_items( + task['task_id'], + scan_status=None, + preprocess_status=None, + match_status='matched', + dedupe_status='unique', + organize_status='organized', + page=1, + page_size=10, + active_only=False + ) + + self.assertEqual(response['total'], 1) + self.assertEqual(response['items'][0]['match_status'], 'matched') + self.assertEqual(response['items'][0]['match_source'], 'musicbrainz') + self.assertEqual(response['items'][0]['matched_metadata_json']['release_id'], 'release-1') + 
self.assertEqual(response['items'][0]['match_candidates_json'][0]['provider'], 'musicbrainz') + self.assertEqual(response['items'][0]['dedupe_status'], 'unique') + self.assertEqual(response['items'][0]['dedupe_decision_json']['identity_basis'], 'recording_id') + self.assertEqual(response['items'][0]['organize_status'], 'organized') + self.assertEqual( + response['items'][0]['organize_decision_json']['final_relative_path'], + 'S/Song Artist/Album Name/01 - Song Title.flac' + ) + + def test_list_task_history_returns_empty_when_no_terminal_tasks(self): + response = self.task_store.list_task_history(page=1, page_size=8) + + self.assertEqual(response, {'items': [], 'page': 1, 'page_size': 8, 'total': 0}) + + def test_list_task_history_only_returns_completed_and_failed_tasks(self): + completed_task = self._create_terminal_task('completed', '2024-01-01T08:00:00Z') + failed_task = self._create_terminal_task('failed', '2024-01-02T08:00:00Z') + active_task = self.task_store.create_task_if_idle( + { + 'input': '/tmp/input', + 'output': '/tmp/output', + 'trash': '/tmp/trash' + } + ) + self.task_store.update_task(active_task['task_id'], status='running') + + response = self.task_store.list_task_history(page=1, page_size=10) + + self.assertEqual(response['total'], 2) + self.assertEqual( + [item['task_id'] for item in response['items']], + [failed_task['task_id'], completed_task['task_id']] + ) + + def test_list_task_history_orders_by_started_at_desc(self): + oldest = self._create_terminal_task('completed', '2024-01-01T08:00:00Z') + middle = self._create_terminal_task('completed', '2024-01-01T12:00:00Z') + newest = self._create_terminal_task('failed', '2024-01-02T09:30:00Z') + + response = self.task_store.list_task_history(page=1, page_size=10) + + self.assertEqual( + [item['task_id'] for item in response['items']], + [newest['task_id'], middle['task_id'], oldest['task_id']] + ) + + def test_list_task_history_aggregates_counts_and_report_status(self): + all_success = 
self._create_terminal_task('completed', '2024-01-01T08:00:00Z') + self._insert_history_item(all_success['task_id']) + self._insert_history_item(all_success['task_id']) + + partial_success = self._create_terminal_task('completed', '2024-01-02T08:00:00Z') + self._insert_history_item(partial_success['task_id']) + self._insert_history_item( + partial_success['task_id'], + match_status='low_score', + match_reason='score_gap_too_small', + match_message='匹配分过低', + dedupe_status='pending', + organize_status='pending' + ) + self._insert_history_item( + partial_success['task_id'], + preprocess_status='failed', + preprocess_reason='convert_failed', + preprocess_message='音频转码失败', + match_status='pending', + dedupe_status='pending', + organize_status='pending' + ) + + failed_task = self._create_terminal_task('failed', '2024-01-03T08:00:00Z') + self._insert_history_item(failed_task['task_id']) + + response = self.task_store.list_task_history(page=1, page_size=10) + items_by_task_id = {item['task_id']: item for item in response['items']} + + self.assertEqual(items_by_task_id[all_success['task_id']]['total_items'], 2) + self.assertEqual(items_by_task_id[all_success['task_id']]['success_items'], 2) + self.assertEqual(items_by_task_id[all_success['task_id']]['exception_items'], 0) + self.assertEqual(items_by_task_id[all_success['task_id']]['report_status'], 'success') + + self.assertEqual(items_by_task_id[partial_success['task_id']]['total_items'], 3) + self.assertEqual(items_by_task_id[partial_success['task_id']]['success_items'], 1) + self.assertEqual(items_by_task_id[partial_success['task_id']]['exception_items'], 2) + self.assertEqual(items_by_task_id[partial_success['task_id']]['report_status'], 'warning') + + self.assertEqual(items_by_task_id[failed_task['task_id']]['total_items'], 1) + self.assertEqual(items_by_task_id[failed_task['task_id']]['success_items'], 1) + self.assertEqual(items_by_task_id[failed_task['task_id']]['exception_items'], 0) + 
self.assertEqual(items_by_task_id[failed_task['task_id']]['report_status'], 'warning') + + def test_list_task_history_paginates_results(self): + first = self._create_terminal_task('completed', '2024-01-01T08:00:00Z') + second = self._create_terminal_task('completed', '2024-01-02T08:00:00Z') + third = self._create_terminal_task('failed', '2024-01-03T08:00:00Z') + + first_page = self.task_store.list_task_history(page=1, page_size=2) + second_page = self.task_store.list_task_history(page=2, page_size=2) + + self.assertEqual(first_page['total'], 3) + self.assertEqual(first_page['page'], 1) + self.assertEqual(first_page['page_size'], 2) + self.assertEqual(len(first_page['items']), 2) + self.assertEqual( + [item['task_id'] for item in first_page['items']], + [third['task_id'], second['task_id']] + ) + self.assertEqual(second_page['total'], 3) + self.assertEqual(second_page['page'], 2) + self.assertEqual(second_page['page_size'], 2) + self.assertEqual( + [item['task_id'] for item in second_page['items']], + [first['task_id']] + ) + + def _create_terminal_task(self, status: str, started_at: str) -> dict: + task = self.task_store.create_task_if_idle( + { + 'input': '/tmp/input', + 'output': '/tmp/output', + 'trash': '/tmp/trash' + } + ) + self.task_store.update_task(task['task_id'], status=status, completed_at=started_at) + + with self.task_store._connect() as connection: + connection.execute( + ''' + UPDATE task_runs + SET started_at = ?, updated_at = ?, completed_at = ? + WHERE id = ? 
+ ''', + (started_at, started_at, started_at, task['task_id']) + ) + connection.commit() + + return self.task_store.get_task(task['task_id']) + + def _insert_history_item(self, task_id: str, **overrides) -> dict: + self._item_index += 1 + item_index = self._item_index + fields = { + 'original_path': f'/tmp/source-{item_index}.flac', + 'relative_path': f'Artist/Album/{item_index:02d}.flac', + 'filename': f'{item_index:02d}.flac', + 'extension': '.flac', + 'size_bytes': 123, + 'modified_at': '2024-01-01T00:00:00Z', + 'local_cover': None, + 'local_lyric': None, + 'scan_status': 'queued', + 'scan_reason': None, + 'scan_message': None, + 'preprocess_status': 'completed', + 'preprocess_reason': None, + 'preprocess_message': None, + 'match_status': 'matched', + 'match_reason': None, + 'match_message': None, + 'dedupe_status': 'unique', + 'dedupe_reason': None, + 'dedupe_message': '未发现重复项', + 'organize_status': 'organized', + 'organize_reason': None, + 'organize_message': '已按标准路径入库' + } + fields.update(overrides) + return self.task_store.insert_task_item(task_id, **fields) + + +@unittest.skipIf(main_module is None, f'api deps unavailable: {FASTAPI_IMPORT_ERROR}') +class TaskHistoryApiTests(unittest.TestCase): + def setUp(self): + self.previous_task_store = main_module.task_store + + def tearDown(self): + main_module.task_store = self.previous_task_store + + def test_get_tasks_returns_paginated_history_payload(self): + fake_store = _FakeTaskStore( + { + 'items': [ + { + 'task_id': 'task-2', + 'started_at': '2024-01-03T12:00:00Z', + 'status': 'failed', + 'total_items': 5, + 'success_items': 3, + 'exception_items': 2, + 'report_status': 'warning' + }, + { + 'task_id': 'task-1', + 'started_at': '2024-01-02T12:00:00Z', + 'status': 'completed', + 'total_items': 4, + 'success_items': 4, + 'exception_items': 0, + 'report_status': 'success' + } + ], + 'page': 2, + 'page_size': 2, + 'total': 7 + } + ) + main_module.task_store = fake_store + + response = main_module.get_tasks(page=2, 
page_size=2) + payload = TaskHistoryListResponse.model_validate(response) + + self.assertEqual(payload.page, 2) + self.assertEqual(payload.page_size, 2) + self.assertEqual(payload.total, 7) + self.assertEqual(payload.items[0].task_id, 'task-2') + self.assertEqual(payload.items[0].report_status, 'warning') + self.assertEqual(payload.items[1].report_status, 'success') + self.assertEqual(fake_store.calls, [{'page': 2, 'page_size': 2}]) + + def test_get_tasks_returns_empty_payload(self): + fake_store = _FakeTaskStore({'items': [], 'page': 1, 'page_size': 8, 'total': 0}) + main_module.task_store = fake_store + + response = main_module.get_tasks(page=1, page_size=8) + payload = TaskHistoryListResponse.model_validate(response) + + self.assertEqual(payload.items, []) + self.assertEqual(payload.total, 0) + self.assertEqual(fake_store.calls, [{'page': 1, 'page_size': 8}]) + + +class _FakeTaskStore: + def __init__(self, response: dict): + self.response = response + self.calls: list[dict] = [] + + def list_task_history(self, page: int, page_size: int) -> dict: + self.calls.append({'page': page, 'page_size': page_size}) + return self.response + + +if __name__ == '__main__': + unittest.main() diff --git a/backend/tests/test_task_runner_preprocess.py b/backend/tests/test_task_runner_preprocess.py new file mode 100644 index 0000000..7df8bca --- /dev/null +++ b/backend/tests/test_task_runner_preprocess.py @@ -0,0 +1,540 @@ +import math +import os +import struct +import tempfile +import unittest +import wave +from pathlib import Path +from unittest.mock import patch + +from backend.app.matcher import MatchProviderError +from backend.app.preprocessor import PreprocessDependencyError, Preprocessor +from backend.app.scanner import Scanner +from backend.app.task_runner import TaskRunner +from backend.app.task_store import TaskStore +from backend.app.task_stream import TaskStreamManager + + +class TaskRunnerPreprocessTests(unittest.TestCase): + def 
test_task_runner_completes_full_pipeline(self): + root = Path(tempfile.mkdtemp()) + input_dir = root / 'input' + output_dir = root / 'output' + trash_dir = root / 'trash' + input_dir.mkdir() + output_dir.mkdir() + trash_dir.mkdir() + + source_file = input_dir / 'Artist' / 'Album' / '01.wav' + source_file.parent.mkdir(parents=True) + _write_wave_file(source_file, duration_seconds=8) + stable_timestamp = source_file.stat().st_mtime - 120 + os.utime(source_file, (stable_timestamp, stable_timestamp)) + + db_path = root / 'music_workshop.db' + task_store = TaskStore(db_path) + task_runner = TaskRunner( + task_store, + Scanner(), + Preprocessor(), + TaskStreamManager(), + matcher=StaticMatcher() + ) + config_snapshot = { + 'input': str(input_dir), + 'output': str(output_dir), + 'trash': str(trash_dir) + } + task = task_store.create_task_if_idle(config_snapshot) + + task_runner.start_task(task['task_id'], config_snapshot) + + persisted_task = task_store.get_task(task['task_id']) + persisted_items = task_store.list_task_items(task['task_id'], None, 1, 20)['items'] + + self.assertEqual(persisted_task['status'], 'completed') + self.assertEqual(persisted_task['current_stage'], 'complete') + self.assertEqual(persisted_task['stage_states']['preprocess'], 'completed') + self.assertEqual(persisted_task['stage_states']['match'], 'completed') + self.assertEqual(persisted_task['stage_states']['dedupe'], 'completed') + self.assertEqual(persisted_task['stage_states']['organize'], 'completed') + self.assertEqual(persisted_task['stats']['scan']['queued'], 1) + self.assertEqual(persisted_task['stats']['preprocess']['input_items'], 1) + self.assertEqual(persisted_task['stats']['preprocess']['converted_items'], 1) + self.assertEqual(persisted_task['stats']['preprocess']['output_items'], 1) + self.assertEqual(persisted_task['stats']['match']['matched_authoritative'], 1) + self.assertEqual(persisted_task['stats']['dedupe']['kept_items'], 1) + 
self.assertEqual(persisted_task['stats']['organize']['moved_items'], 1) + self.assertEqual(len(persisted_items), 1) + + item = persisted_items[0] + self.assertEqual(item['preprocess_status'], 'completed') + self.assertEqual(item['match_status'], 'matched') + self.assertEqual(item['dedupe_status'], 'unique') + self.assertEqual(item['organize_status'], 'organized') + self.assertTrue(item['current_file_path'].endswith('.flac')) + self.assertTrue(Path(item['current_file_path']).exists()) + self.assertTrue(str(output_dir) in item['current_file_path']) + self.assertEqual(item['audio_props_json']['codec'], 'FLAC') + self.assertTrue(item['acoustic_fingerprint']) + self.assertEqual(item['matched_metadata_json']['release_id'], 'release-1') + self.assertEqual(item['library_relative_path'], 'M/Matched Artist/Matched Album/01 - Matched Song.flac') + + def test_task_runner_fails_fast_when_preprocess_dependencies_are_missing(self): + root = Path(tempfile.mkdtemp()) + input_dir = root / 'input' + output_dir = root / 'output' + trash_dir = root / 'trash' + input_dir.mkdir() + output_dir.mkdir() + trash_dir.mkdir() + + source_file = input_dir / 'single.flac' + source_file.write_bytes(b'not-real-audio') + stable_timestamp = source_file.stat().st_mtime - 120 + os.utime(source_file, (stable_timestamp, stable_timestamp)) + + db_path = root / 'music_workshop.db' + task_store = TaskStore(db_path) + preprocessor = Preprocessor() + task_runner = TaskRunner( + task_store, + Scanner(), + preprocessor, + TaskStreamManager(), + matcher=StaticMatcher() + ) + config_snapshot = { + 'input': str(input_dir), + 'output': str(output_dir), + 'trash': str(trash_dir) + } + task = task_store.create_task_if_idle(config_snapshot) + + with patch.object( + preprocessor, + 'check_dependencies', + side_effect=PreprocessDependencyError('missing preprocess dependencies') + ): + task_runner.start_task(task['task_id'], config_snapshot) + + persisted_task = task_store.get_task(task['task_id']) + + 
self.assertEqual(persisted_task['status'], 'failed') + self.assertEqual(persisted_task['current_stage'], 'preprocess') + self.assertEqual(persisted_task['stage_states']['preprocess'], 'failed') + self.assertEqual( + persisted_task['error_message'], + 'missing preprocess dependencies' + ) + + def test_task_runner_marks_match_item_failed_when_provider_errors(self): + root = Path(tempfile.mkdtemp()) + input_dir = root / 'input' + output_dir = root / 'output' + trash_dir = root / 'trash' + input_dir.mkdir() + output_dir.mkdir() + trash_dir.mkdir() + + source_file = input_dir / 'Artist' / 'Album' / '01.wav' + source_file.parent.mkdir(parents=True) + _write_wave_file(source_file, duration_seconds=8) + stable_timestamp = source_file.stat().st_mtime - 120 + os.utime(source_file, (stable_timestamp, stable_timestamp)) + + db_path = root / 'music_workshop.db' + task_store = TaskStore(db_path) + task_runner = TaskRunner( + task_store, + Scanner(), + Preprocessor(), + TaskStreamManager(), + matcher=ErrorMatcher() + ) + config_snapshot = { + 'input': str(input_dir), + 'output': str(output_dir), + 'trash': str(trash_dir) + } + task = task_store.create_task_if_idle(config_snapshot) + + task_runner.start_task(task['task_id'], config_snapshot) + + persisted_task = task_store.get_task(task['task_id']) + persisted_items = task_store.list_task_items(task['task_id'], None, 1, 20)['items'] + + self.assertEqual(persisted_task['status'], 'completed') + self.assertEqual(persisted_task['stats']['match']['failed_items'], 1) + self.assertEqual(persisted_items[0]['match_status'], 'failed') + self.assertEqual(persisted_items[0]['match_reason'], 'provider_error') + self.assertFalse(source_file.exists()) + self.assertFalse(persisted_items[0]['is_active']) + self.assertTrue(Path(persisted_items[0]['current_file_path']).exists()) + self.assertTrue(Path(persisted_items[0]['trash_file_path']).exists()) + self.assertIn('/trash/match_failed/', persisted_items[0]['current_file_path']) + + def 
test_task_runner_quarantines_historical_exception_before_scan(self): + root = Path(tempfile.mkdtemp()) + input_dir = root / 'input' + output_dir = root / 'output' + trash_dir = root / 'trash' + input_dir.mkdir() + output_dir.mkdir() + trash_dir.mkdir() + + source_file = input_dir / 'failed.flac' + source_file.write_bytes(b'audio') + stable_timestamp = source_file.stat().st_mtime - 120 + os.utime(source_file, (stable_timestamp, stable_timestamp)) + + db_path = root / 'music_workshop.db' + task_store = TaskStore(db_path) + config_snapshot = { + 'input': str(input_dir), + 'output': str(output_dir), + 'trash': str(trash_dir) + } + previous_task = task_store.create_task_if_idle(config_snapshot) + previous_item = task_store.insert_task_item( + previous_task['task_id'], + original_path=str(source_file), + current_file_path=str(source_file), + relative_path='failed.flac', + filename='failed.flac', + extension='.flac', + size_bytes=source_file.stat().st_size, + modified_at=None, + local_cover=None, + local_lyric=None, + scan_status='queued', + scan_reason=None, + scan_message=None, + preprocess_status='completed', + match_status='failed', + match_reason='provider_error', + match_message='provider exploded' + ) + task_store.update_task(previous_task['task_id'], status='completed', current_stage='complete') + + task = task_store.create_task_if_idle(config_snapshot) + task_runner = TaskRunner( + task_store, + Scanner(), + Preprocessor(), + TaskStreamManager(), + matcher=StaticMatcher() + ) + + task_runner.start_task(task['task_id'], config_snapshot) + + persisted_task = task_store.get_task(task['task_id']) + quarantined_item = task_store.get_exception_source_item(previous_item['id']) + + self.assertEqual(persisted_task['status'], 'completed') + self.assertEqual(persisted_task['stats']['scan']['total_found'], 0) + self.assertEqual(persisted_task['stats']['scan']['queued'], 0) + self.assertFalse(source_file.exists()) + self.assertFalse(quarantined_item['is_active']) + 
self.assertTrue(Path(quarantined_item['trash_file_path']).exists()) + self.assertIn('/trash/match_failed/', quarantined_item['trash_file_path']) + + def test_task_runner_skips_exception_with_existing_trash_path(self): + root = Path(tempfile.mkdtemp()) + input_dir = root / 'input' + output_dir = root / 'output' + trash_dir = root / 'trash' + input_dir.mkdir() + output_dir.mkdir() + trash_dir.mkdir() + + source_file = input_dir / 'duplicate.flac' + source_file.write_bytes(b'audio') + existing_trash = trash_dir / 'duplicates' / 'old-task' / '1_duplicate.flac' + existing_trash.parent.mkdir(parents=True) + existing_trash.write_bytes(b'audio') + + db_path = root / 'music_workshop.db' + task_store = TaskStore(db_path) + config_snapshot = { + 'input': str(input_dir), + 'output': str(output_dir), + 'trash': str(trash_dir) + } + task = task_store.create_task_if_idle(config_snapshot) + item = task_store.insert_task_item( + task['task_id'], + original_path=str(source_file), + current_file_path=str(existing_trash), + relative_path='duplicate.flac', + filename='duplicate.flac', + extension='.flac', + size_bytes=source_file.stat().st_size, + modified_at=None, + local_cover=None, + local_lyric=None, + scan_status='queued', + scan_reason=None, + scan_message=None, + preprocess_status='completed', + match_status='matched', + dedupe_status='duplicate_trashed', + trash_file_path=str(existing_trash) + ) + task_runner = TaskRunner( + task_store, + Scanner(), + Preprocessor(), + TaskStreamManager(), + matcher=StaticMatcher() + ) + + task_runner._quarantine_exception_items(task['task_id'], config_snapshot, scope='current') + + persisted_item = task_store.get_exception_source_item(item['id']) + self.assertTrue(source_file.exists()) + self.assertTrue(existing_trash.exists()) + self.assertEqual(persisted_item['trash_file_path'], str(existing_trash)) + + def test_task_runner_quarantines_converted_exception_and_original_source(self): + root = Path(tempfile.mkdtemp()) + input_dir = root / 
'input' + output_dir = root / 'output' + trash_dir = root / 'trash' + workspace_dir = root / 'workspace' + input_dir.mkdir() + output_dir.mkdir() + trash_dir.mkdir() + workspace_dir.mkdir() + + source_file = input_dir / 'source.wav' + converted_file = workspace_dir / 'source.flac' + source_file.write_bytes(b'wav') + converted_file.write_bytes(b'flac') + + db_path = root / 'music_workshop.db' + task_store = TaskStore(db_path) + config_snapshot = { + 'input': str(input_dir), + 'output': str(output_dir), + 'trash': str(trash_dir) + } + task = task_store.create_task_if_idle(config_snapshot) + item = task_store.insert_task_item( + task['task_id'], + original_path=str(source_file), + current_file_path=str(converted_file), + relative_path='source.wav', + filename='source.flac', + extension='.flac', + size_bytes=converted_file.stat().st_size, + modified_at=None, + local_cover=None, + local_lyric=None, + scan_status='queued', + scan_reason=None, + scan_message=None, + preprocess_status='completed', + match_status='low_score', + match_reason='score_below_threshold', + match_message='候选分数不足' + ) + task_runner = TaskRunner( + task_store, + Scanner(), + Preprocessor(), + TaskStreamManager(), + matcher=StaticMatcher() + ) + + task_runner._quarantine_exception_items(task['task_id'], config_snapshot, scope='current') + + quarantined_item = task_store.get_exception_source_item(item['id']) + moved_paths = sorted((trash_dir / 'low_score' / task['task_id']).glob('*')) + self.assertFalse(source_file.exists()) + self.assertFalse(converted_file.exists()) + self.assertEqual(len(moved_paths), 2) + self.assertFalse(quarantined_item['is_active']) + self.assertTrue(Path(quarantined_item['current_file_path']).exists()) + self.assertTrue(Path(quarantined_item['trash_file_path']).exists()) + self.assertEqual( + Path(quarantined_item['current_file_path']).name, + f'{item["id"]}_source.flac' + ) + + def test_task_runner_logs_provider_warnings_and_continues_matching(self): + root = 
Path(tempfile.mkdtemp()) + input_dir = root / 'input' + output_dir = root / 'output' + trash_dir = root / 'trash' + input_dir.mkdir() + output_dir.mkdir() + trash_dir.mkdir() + + source_file = input_dir / 'Artist' / 'Album' / '01.wav' + source_file.parent.mkdir(parents=True) + _write_wave_file(source_file, duration_seconds=8) + stable_timestamp = source_file.stat().st_mtime - 120 + os.utime(source_file, (stable_timestamp, stable_timestamp)) + + db_path = root / 'music_workshop.db' + task_store = TaskStore(db_path) + task_runner = TaskRunner( + task_store, + Scanner(), + Preprocessor(), + TaskStreamManager(), + matcher=WarningMatcher() + ) + config_snapshot = { + 'input': str(input_dir), + 'output': str(output_dir), + 'trash': str(trash_dir) + } + task = task_store.create_task_if_idle(config_snapshot) + + task_runner.start_task(task['task_id'], config_snapshot) + + persisted_task = task_store.get_task(task['task_id']) + persisted_items = task_store.list_task_items(task['task_id'], None, 1, 20)['items'] + persisted_logs = task_store.list_task_logs(task['task_id'], 1, 50)['logs'] + + self.assertEqual(persisted_task['status'], 'completed') + self.assertEqual(persisted_task['stats']['match']['matched_fallback'], 1) + self.assertEqual(persisted_task['stats']['match']['provider_warnings'], 2) + self.assertEqual(persisted_task['stats']['match']['failed_items'], 0) + self.assertEqual(persisted_items[0]['match_status'], 'matched_fallback') + self.assertEqual( + [log['event_type'] for log in persisted_logs if log['event_type'] == 'match.provider_skipped'], + ['match.provider_skipped', 'match.provider_skipped'] + ) + + +class StaticMatcher: + def match_item(self, item, album_group, config): + return { + 'status': 'matched', + 'reason': 'authoritative_auto_match', + 'message': '静态测试匹配成功', + 'source': 'musicbrainz', + 'confidence': 93.5, + 'is_authoritative': True, + 'matched_metadata_json': { + 'title': 'Matched Song', + 'artist': 'Matched Artist', + 'artists': ['Matched 
Artist'], + 'album': 'Matched Album', + 'album_artist': 'Matched Artist', + 'track_number': 1, + 'disc_number': 1, + 'release_date': '2024-01-01', + 'year': 2024, + 'duration_seconds': 8.0, + 'recording_id': 'recording-1', + 'release_id': 'release-1', + 'release_group_id': 'release-group-1', + 'source_ids': {'musicbrainz_recording_id': 'recording-1'} + }, + 'match_candidates_json': [ + { + 'provider': 'musicbrainz', + 'score': 93.5, + 'score_breakdown': {'title': 20}, + 'is_authoritative': True, + 'recording_id': 'recording-1', + 'release_id': 'release-1', + 'release_group_id': 'release-group-1', + 'source_ids': {'musicbrainz_recording_id': 'recording-1'} + } + ], + 'match_enrichment_json': { + 'cover': {'selected_source': None, 'candidates': []}, + 'lyrics': {'selected_source': None, 'candidates': []}, + 'genres': {'selected_source': None, 'candidates': []}, + 'tags': {'selected_source': None, 'candidates': []} + }, + 'provider_warnings': [] + } + + +class ErrorMatcher: + def match_item(self, item, album_group, config): + raise MatchProviderError('musicbrainz', 'provider exploded') + + +class WarningMatcher: + def match_item(self, item, album_group, config): + return { + 'status': 'matched_fallback', + 'reason': 'fallback_auto_match', + 'message': 'Fallback 候选自动匹配成功,得分 88.0', + 'source': 'qq', + 'confidence': 88.0, + 'is_authoritative': False, + 'matched_metadata_json': { + 'title': 'Matched Song', + 'artist': 'Matched Artist', + 'artists': ['Matched Artist'], + 'album': 'Matched Album', + 'album_artist': 'Matched Artist', + 'track_number': 1, + 'disc_number': 1, + 'release_date': '2024-01-01', + 'year': 2024, + 'duration_seconds': 8.0, + 'recording_id': None, + 'release_id': None, + 'release_group_id': None, + 'source_ids': {'qq_song_mid': 'song-1'} + }, + 'match_candidates_json': [ + { + 'provider': 'qq', + 'score': 88.0, + 'score_breakdown': {'title': 20}, + 'is_authoritative': False, + 'recording_id': None, + 'release_id': None, + 'release_group_id': None, + 
'source_ids': {'qq_song_mid': 'song-1'} + } + ], + 'match_enrichment_json': { + 'cover': {'selected_source': None, 'candidates': []}, + 'lyrics': {'selected_source': None, 'candidates': []}, + 'genres': {'selected_source': None, 'candidates': []}, + 'tags': {'selected_source': None, 'candidates': []} + }, + 'provider_warnings': [ + { + 'provider': 'acoustid', + 'message': 'acoustid 请求失败 (HTTP 400) {"error":{"code":4,"message":"invalid API key"}}' + }, + { + 'provider': 'spotify', + 'message': 'spotify 请求失败 (HTTP 403) Active premium subscription required for the owner of the app.' + } + ] + } + + +def _write_wave_file(path: Path, *, duration_seconds: int): + sample_rate = 44100 + frequency = 440.0 + amplitude = 16000 + total_frames = sample_rate * duration_seconds + + with wave.open(str(path), 'wb') as handle: + handle.setnchannels(1) + handle.setsampwidth(2) + handle.setframerate(sample_rate) + + frames = bytearray() + for index in range(total_frames): + sample = int(amplitude * math.sin((2.0 * math.pi * frequency * index) / sample_rate)) + frames.extend(struct.pack('=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, "node_modules/cssesc": { "version": "3.0.0", "resolved": "https://registry.npmmirror.com/cssesc/-/cssesc-3.0.0.tgz", @@ -2207,6 +2221,44 @@ "node": ">=0.10.0" } }, + "node_modules/react-router": { + "version": "7.14.2", + "resolved": "https://registry.npmmirror.com/react-router/-/react-router-7.14.2.tgz", + "integrity": "sha512-yCqNne6I8IB6rVCH7XUvlBK7/QKyqypBFGv+8dj4QBFJiiRX+FG7/nkdAvGElyvVZ/HQP5N19wzteuTARXi5Gw==", + "license": "MIT", + "dependencies": { + "cookie": "^1.0.1", + "set-cookie-parser": "^2.6.0" + }, + "engines": { + "node": ">=20.0.0" + }, + "peerDependencies": { + "react": ">=18", + "react-dom": ">=18" + }, + "peerDependenciesMeta": { + "react-dom": { + "optional": true + } + } + }, + "node_modules/react-router-dom": { + "version": "7.14.2", + "resolved": 
"https://registry.npmmirror.com/react-router-dom/-/react-router-dom-7.14.2.tgz", + "integrity": "sha512-YZcM5ES8jJSM+KrJ9BdvHHqlnGTg5tH3sC5ChFRj4inosKctdyzBDhOyyHdGk597q2OT6NTrCA1OvB/YDwfekQ==", + "license": "MIT", + "dependencies": { + "react-router": "7.14.2" + }, + "engines": { + "node": ">=20.0.0" + }, + "peerDependencies": { + "react": ">=18", + "react-dom": ">=18" + } + }, "node_modules/read-cache": { "version": "1.0.0", "resolved": "https://registry.npmmirror.com/read-cache/-/read-cache-1.0.0.tgz", @@ -2351,6 +2403,12 @@ "semver": "bin/semver.js" } }, + "node_modules/set-cookie-parser": { + "version": "2.7.2", + "resolved": "https://registry.npmmirror.com/set-cookie-parser/-/set-cookie-parser-2.7.2.tgz", + "integrity": "sha512-oeM1lpU/UvhTxw+g3cIfxXHyJRc/uidd3yK1P242gzHds0udQBYzs3y8j4gCCW+ZJ7ad0yctld8RYO+bdurlvw==", + "license": "MIT" + }, "node_modules/source-map-js": { "version": "1.2.1", "resolved": "https://registry.npmmirror.com/source-map-js/-/source-map-js-1.2.1.tgz", diff --git a/frontend/package.json b/frontend/package.json index d0236ad..b9ecb7e 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -11,7 +11,8 @@ "dependencies": { "lucide-react": "^0.525.0", "react": "^18.3.1", - "react-dom": "^18.3.1" + "react-dom": "^18.3.1", + "react-router-dom": "^7.14.2" }, "devDependencies": { "@vitejs/plugin-react": "^4.3.1", diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index d314fb2..5985e45 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -1,2179 +1,78 @@ -import React, { useEffect, useRef, useState } from 'react'; -import { - Activity, - AlertCircle, - AlertTriangle, - ArrowRightLeft, - BellRing, - CalendarClock, - Check, - CheckCircle2, - ChevronLeft, - ChevronRight, - Clock, - Database, - Disc, - Edit3, - FileText, - Folder, - History, - ImageIcon, - ImagePlus, - LayoutDashboard, - Layers, - ListChecks, - Mail, - MessageSquare, - Mic2, - Music, - PauseCircle, - Play, - PlayCircle, - RefreshCw, - RotateCcw, - 
Search, - Send, - Settings, - SlidersHorizontal, - Trash2, - Volume2, - Wifi, - X, - XCircle -} from 'lucide-react'; +import { useEffect, useState } from 'react'; +import { BrowserRouter, Navigate, Route, Routes } from 'react-router-dom'; +import { fetchConfig } from './api/config'; +import { fetchCurrentTask } from './api/tasks'; +import AppLayout from './components/AppLayout'; +import { createInitialConfig, deriveTaskState } from './constants'; +import ExceptionPage from './pages/ExceptionPage'; +import HistoryPage from './pages/HistoryPage'; +import LibraryPage from './pages/LibraryPage'; +import SettingsPage from './pages/SettingsPage'; +import WorkbenchPage from './pages/WorkbenchPage'; -const STAGES = [ - { id: 'scan', name: '扫描目录' }, - { id: 'preprocess', name: '音频预处理' }, - { id: 'match', name: '音乐匹配' }, - { id: 'dedupe', name: '重复检测' }, - { id: 'organize', name: '整理入库' }, - { id: 'complete', name: '批次完成' } -]; - -const TRASH_REASONS = [ - { id: 'low_score', name: '匹配分过低' }, - { id: 'missing_tags', name: '元数据缺失' }, - { id: 'duplicates', name: '文件重复' }, - { id: 'convert_failed', name: '转码失败' }, - { id: 'match_failed', name: '匹配失败' }, - { id: 'organize_failed', name: '入库失败' } -]; - -export default function MusicFlowApp() { - const [activeTab, setActiveTab] = useState('workbench'); - const [config, setConfig] = useState({ - input: '/volume1/downloads/music', - output: '/volume1/docker/navidrome/music', - trash: '/volume1/docker/navidrome/trash', - schedule: { enabled: true, cron: '0 2 * * *' }, - notifications: { - dingtalkWebhook: '', - dingtalkSecret: '', - telegramBotToken: '', - telegramChatId: '', - emailSmtp: '', - emailUser: '', - emailPass: '', - emailTo: '' - }, - metadata: { - musicbrainz: 'https://musicbrainz.org/ws/2/', - netease: 'http://localhost:3000', - qq: 'http://localhost:3300', - spotifyUrl: 'https://api.spotify.com/v1', - spotifyClientId: '', - spotifySecret: '', - discogsUrl: 'https://api.discogs.com', - discogsToken: '', - lastfmUrl: 
'https://ws.audioscrobbler.com/2.0/', - lastfmKey: '', - geniusUrl: 'https://api.genius.com', - geniusToken: '' - } - }); +export default function App() { + const [config, setConfig] = useState(createInitialConfig); const [taskState, setTaskState] = useState('ready'); - const [connState] = useState('connected'); - const [progress, setProgress] = useState({ - stageIndex: 0, - percent: 0, - currentFile: '-', - stats: { total: 0, processed: 0, success: 0, failed: 0, skipped: 0 }, - logs: [], - trashDistribution: { - low_score: 0, - missing_tags: 0, - duplicates: 0, - convert_failed: 0, - match_failed: 0, - organize_failed: 0 - } - }); + const [connState, setConnState] = useState('polling'); useEffect(() => { - if (taskState !== 'running') return undefined; + let isMounted = true; - let currentPercent = 0; - let currentStage = 0; - const totalFiles = Math.floor(Math.random() * 50) + 150; - let processedFiles = 0; - - setProgress((prev) => ({ - ...prev, - stageIndex: 0, - percent: 0, - stats: { total: totalFiles, processed: 0, success: 0, failed: 0, skipped: 0 }, - logs: [ - { - time: new Date().toLocaleTimeString(), - type: 'info', - text: '任务已启动,正在初始化工作流...' 
- } - ], - trashDistribution: { - low_score: 0, - missing_tags: 0, - duplicates: 0, - convert_failed: 0, - match_failed: 0, - organize_failed: 0 + async function loadConfig() { + try { + const [remoteConfig, currentTaskResponse] = await Promise.all([ + fetchConfig(), + fetchCurrentTask() + ]); + if (!isMounted) return; + setConfig(remoteConfig); + setTaskState(deriveTaskState(remoteConfig, currentTaskResponse.task)); + } catch (error) { + console.error('Failed to load config:', error); } - })); + } - const timer = setInterval(() => { - currentPercent += Math.random() * 2; + loadConfig(); - if (currentPercent >= 100) { - currentPercent = 100; - setTaskState('completed'); - clearInterval(timer); - } - - if (currentPercent < 10) currentStage = 0; - else if (currentPercent < 30) currentStage = 1; - else if (currentPercent < 60) currentStage = 2; - else if (currentPercent < 75) currentStage = 3; - else if (currentPercent < 95) currentStage = 4; - else currentStage = 5; - - if (currentPercent > 10 && currentPercent < 95) { - processedFiles = Math.floor(((currentPercent - 10) / 85) * totalFiles); - } else if (currentPercent >= 95) { - processedFiles = totalFiles; - } - - let newLog = null; - let newTrash = null; - if (Math.random() > 0.7 && currentStage > 0 && currentStage < 5) { - const file = `Track_${Math.floor(Math.random() * 9000)}.flac`; - const isError = Math.random() > 0.8; - - if (isError) { - const reasonId = - TRASH_REASONS[Math.floor(Math.random() * TRASH_REASONS.length)].id; - newTrash = reasonId; - newLog = { - time: new Date().toLocaleTimeString(), - type: 'error', - text: `[${STAGES[currentStage].name}] 异常: ${file} -> 移至回收站 (${reasonId})` - }; - } else { - newLog = { - time: new Date().toLocaleTimeString(), - type: 'success', - text: `[${STAGES[currentStage].name}] 成功处理: ${file}` - }; - } - } - - setProgress((prev) => { - const nextStats = { ...prev.stats, processed: processedFiles }; - if (newTrash) nextStats.failed += 1; - else if (newLog?.type === 
'success') { - nextStats.success = processedFiles - nextStats.failed - nextStats.skipped; - } - - return { - ...prev, - percent: currentPercent, - stageIndex: currentStage, - currentFile: - currentPercent < 100 - ? `正在处理目录: /folder/album_${Math.floor(currentPercent)}/track.flac` - : '处理完成', - stats: nextStats, - logs: newLog ? [...prev.logs, newLog] : prev.logs, - trashDistribution: newTrash - ? { - ...prev.trashDistribution, - [newTrash]: prev.trashDistribution[newTrash] + 1 - } - : prev.trashDistribution - }; - }); - }, 200); - - return () => clearInterval(timer); - }, [taskState]); + return () => { + isMounted = false; + }; + }, []); return ( -
-
-
- - 音流工坊 -
-
- 主菜单 -
- -
- Navidrome Auto-Ingest Engine v1.2.0 -
-
- -
-
-

- {activeTab === 'workbench' - ? '工作台' - : activeTab === 'library' - ? '音乐库' - : activeTab === 'exceptions' - ? '异常中心' - : activeTab === 'history' - ? '任务历史' - : '系统配置'} -

-
-
- {connState === 'connected' ? ( - <> - - - - - - - 实时连接中 (WS) - - - ) : ( - <> - - 轮询兜底中 - - )} -
-
- 系统状态: - - {taskState === 'unconfigured' - ? '未配置' - : taskState === 'ready' - ? '已配置,待机中' - : taskState === 'running' - ? '任务执行中' - : '批次完成'} - -
-
-
- -
- {activeTab === 'workbench' && ( - setActiveTab('settings')} - /> - )} - {activeTab === 'library' && } - {activeTab === 'exceptions' && } - {activeTab === 'history' && } - {activeTab === 'settings' && ( - - )} -
-
-
- ); -} - -function NavButton({ active, icon: Icon, label, onClick }) { - return ( - - ); -} - -function WorkbenchTab({ - config, - taskState, - setTaskState, - progress, - setProgress, - onNavigateToSettings -}) { - const isRunning = taskState === 'running'; - const isCompleted = taskState === 'completed'; - const logsEndRef = useRef(null); - - useEffect(() => { - if (logsEndRef.current) { - logsEndRef.current.scrollIntoView({ behavior: 'smooth' }); - } - }, [progress.logs]); - - const handleStart = () => { - if (!config.input || !config.output || !config.trash) { - window.alert('请先在设置中配置目录!'); - return; - } - setTaskState('running'); - }; - - const handleReset = () => { - setTaskState('ready'); - setProgress({ - stageIndex: 0, - percent: 0, - currentFile: '-', - stats: { total: 0, processed: 0, success: 0, failed: 0, skipped: 0 }, - logs: [], - trashDistribution: { - low_score: 0, - missing_tags: 0, - duplicates: 0, - convert_failed: 0, - match_failed: 0, - organize_failed: 0 - } - }); - }; - - return ( -
-
-
-
-

- - 处理编排 -

- -
-
- - - - {(!config.input || !config.output || !config.trash) && ( -
- -
- )} -
-
- -
- {!isRunning && !isCompleted && ( - - )} - - {isRunning && ( -
-
- -
-

处理引擎运行中

-

- 请勿关闭当前页面,系统正在自动流转... -

-
- )} - - {isCompleted && ( -
-
- -
-

批次任务已完成

- -
- )} -
-
- -
-
-
-
-
- {STAGES.map((stage, idx) => { - const isActive = idx === progress.stageIndex && isRunning; - const isPast = idx < progress.stageIndex || isCompleted; - - return ( -
-
- {isPast && !isActive ? : idx + 1} -
- - {stage.name} - -
- ); - })} -
-
- -
-
-
-
- 当前进度 -
-
- {progress.currentFile} -
-
-
- {progress.percent.toFixed(1)}% -
-
-
-
-
-
- -
-
-
-

- 数据统计 -

-
- - - - -
-
- -
-
-

- - 批次报告 -

- {isCompleted && } -
-

- {isCompleted - ? '已生成完整运行报告,点击查看详细摘要、处理清单与归档路径。' - : '任务完成后生成报告...'} -

-
-
- -
-

- 异常回收分布 - -

-
- {TRASH_REASONS.map((reason) => { - const count = progress.trashDistribution[reason.id]; - const max = Math.max(1, progress.stats.failed); - const percent = (count / max) * 100; - return ( -
-
- {reason.name} - {count} -
-
-
0 ? 'bg-rose-500' : 'bg-slate-800' - }`} - style={{ width: `${percent}%` }} - /> -
-
- ); - })} -
-
- -
-
- - 任务记录流 -
-
- {progress.logs.length === 0 ? ( -
等待任务启动...
- ) : ( - progress.logs.map((log, i) => ( -
- [{log.time}] - - {log.text} - -
- )) - )} -
-
-
-
-
-
- ); -} - -function DirectoryField({ label, value, missingText }) { - return ( -
- -
- {value || missingText} -
-
- ); -} - -function StatCard({ label, value, labelClass = 'text-slate-500', valueClass, error = false }) { - return ( -
-
{label}
-
{value}
-
- ); -} - -const MOCK_LIBRARY_TRACKS = Array.from({ length: 45 }).map((_, idx) => ({ - id: idx + 1, - file: `track_${1000 + idx}.flac`, - title: `Beautiful Day ${idx + 1}`, - artist: idx % 3 === 0 ? 'Coldplay Cover' : 'U2 Cover Band', - album: `Greatest Hits ${2020 + (idx % 4)}`, - hasCover: idx % 5 !== 0, - hasLyrics: idx % 4 !== 0, - time: `2023-10-26 14:${String(idx % 60).padStart(2, '0')}` -})); - -function LibraryTab() { - const [playingId, setPlayingId] = useState(null); - const [currentPage, setCurrentPage] = useState(1); - const itemsPerPage = 8; - - const totalPages = Math.ceil(MOCK_LIBRARY_TRACKS.length / itemsPerPage); - const currentTracks = MOCK_LIBRARY_TRACKS.slice( - (currentPage - 1) * itemsPerPage, - currentPage * itemsPerPage - ); - - return ( -
-
-
-

总体音乐库管理

-

- 查看当前 Navidrome 库中的统计信息与成功入库的文件。 -

-
- -
- -
- - - - -
- -
-
-

最近成功入库记录

-
- - -
-
- - - - - - - - - - - - - - {currentTracks.map((track) => ( - - setPlayingId(playingId === track.id ? null : track.id) - } - > - - - - - - - - - ))} - -
试听源文件名称识别标题识别艺术家识别专辑资产状态入库时间
- - - {track.file} - - {track.title} - {track.artist}{track.album} -
- - -
-
{track.time}
- - setCurrentPage((p) => Math.max(1, p - 1))} - onNext={() => setCurrentPage((p) => Math.min(totalPages, p + 1))} - summary={ - <> - 显示 {(currentPage - 1) * itemsPerPage + 1} 到{' '} - {Math.min(currentPage * itemsPerPage, MOCK_LIBRARY_TRACKS.length)} 条,共{' '} - - {MOCK_LIBRARY_TRACKS.length} - {' '} - 条记录 - - } - /> -
-
- ); -} - -function MetricCard({ icon: Icon, label, value, iconClass }) { - return ( -
-
- -
-
-
{label}
-
{value}
-
-
- ); -} - -function AssetChip({ active, activeClass, label }) { - return ( - - {label} - - ); -} - -function ExceptionTab() { - const [selectedException, setSelectedException] = useState(null); - const [selectedIds, setSelectedIds] = useState([]); - const [isPlaying, setIsPlaying] = useState(false); - - const mockExceptions = [ - { - id: 1, - file: 'Unknown_Audio_Rip.mp3', - type: 'missing_tags', - reason: '无法提取有效元数据', - date: '2023-10-26 14:22' - }, - { - id: 2, - file: 'Track01 - Copy.flac', - type: 'duplicates', - reason: '与库中已存在曲目高度相似', - date: '2023-10-26 14:25' - }, - { - id: 3, - file: 'Local_Band_Demo.wav', - type: 'match_failed', - reason: 'MusicBrainz 查无此曲', - date: '2023-10-26 14:30' - }, - { - id: 4, - file: 'Voice_Memo_002.m4a', - type: 'low_score', - reason: '指纹匹配置信度极低 (12%)', - date: '2023-10-26 14:35' - }, - { - id: 5, - file: 'Live_Concert_Bootleg.flac', - type: 'match_failed', - reason: '未找到匹配的专辑信息', - date: '2023-10-26 15:01' - } - ]; - - const handleSelectAll = (e) => { - setSelectedIds(e.target.checked ? mockExceptions.map((exc) => exc.id) : []); - }; - - const handleSelectOne = (id) => { - setSelectedIds((prev) => - prev.includes(id) ? prev.filter((itemId) => itemId !== id) : [...prev, id] - ); - }; - - const handleRowClick = (exc) => { - setSelectedException(exc); - setIsPlaying(false); - }; - - return ( -
-
-
-
-
-

- - 异常隔离池 -

-

- 多次任务中产生的异常文件集中处理。支持在线试听与批量操作。 -

-
- -
-
- -
- - - - - - - - - - - {mockExceptions.map((exc) => ( - handleRowClick(exc)} - > - - - - - - ))} - -
- 0 - } - onChange={handleSelectAll} - /> - 异常文件失败归类详细原因
e.stopPropagation()}> - handleSelectOne(exc.id)} - /> - - {exc.file} - - - {TRASH_REASONS.find((r) => r.id === exc.type)?.name || exc.type} - - {exc.reason}
-
- - {selectedIds.length > 0 && ( -
- - - 已选择 {selectedIds.length} 个异常文件 - -
- - -
-
- )} -
- -
- {!selectedException ? ( -
- -

点击左侧列表中的文件展开操作面板

-
- ) : ( -
-

- 文件处理决策 -

- -
-
当前文件
-
- {selectedException.file} -
-
捕获时间
-
{selectedException.date}
-
- -
-
-
- {selectedException.type === 'missing_tags' ? ( - - ) : ( -
- -
- )} -
- - -
-
- - {isPlaying ? '01:14' : '00:00'} - - 04:32 -
-
-
-
-
- -
-
- Audio Preview • 44.1kHz • 320kbps -
-
- - {(selectedException.type === 'missing_tags' || - selectedException.type === 'match_failed' || - selectedException.type === 'low_score') && ( -
-
-

- - 一键匹配 (多源) -

-

- 尝试使用其他元数据服务提供商重新指纹匹配。 -

-
- - -
-
- -
-

- - 手动编辑元数据 (含资产) -

- -
-
- - 上传封面 -
- - 拖拽 JPG/PNG -
- 或点击浏览 -
-
-
- -
- - - -
-
- -
- -