1063 lines
35 KiB
Python
1063 lines
35 KiB
Python
import os
|
|
import re
|
|
import shutil
|
|
import time
|
|
import unicodedata
|
|
from pathlib import Path
|
|
|
|
from . import library_index
|
|
from .scanner import ALLOWED_AUDIO_EXTENSIONS
|
|
from .task_constants import (
|
|
DEDUPE_PROGRESS_BATCH_SIZE,
|
|
DEDUPE_PROGRESS_INTERVAL_SECONDS,
|
|
ORGANIZE_PROGRESS_BATCH_SIZE,
|
|
ORGANIZE_PROGRESS_INTERVAL_SECONDS,
|
|
TASK_STATUS_RUNNING
|
|
)
|
|
|
|
|
|
# Longest sanitized path component (directory or file name) that
# _sanitize_path_component will return before truncating.
MAX_PATH_COMPONENT_LENGTH = 96

# When replacement is enabled, a batch file replaces an existing library file
# only if its quality total is at least this many points higher.
REPLACE_SCORE_THRESHOLD = 15.0

# Re-exported from library_index so this module and its importers share one
# definition of each set.
PRESERVED_VERSION_TOKENS = library_index.PRESERVED_VERSION_TOKENS
LOSSLESS_EXTENSIONS = library_index.LOSSLESS_EXTENSIONS
|
|
|
|
|
|
class DedupeItemError(Exception):
    """Failure while deduplicating a single task item.

    Attributes:
        reason: Short machine-readable code (for example ``'source_missing'``).
        message: Human-readable description; also used as the exception text.
    """

    def __init__(self, reason: str, message: str):
        # Only the message is forwarded to Exception, so str(error) == message.
        super().__init__(message)
        self.message = message
        self.reason = reason
|
|
|
|
|
|
class OrganizeItemError(Exception):
    """Failure while organizing a single task item into the library.

    Attributes:
        reason: Short machine-readable code (for example ``'move_failed'``).
        message: Human-readable description; also used as the exception text.
    """

    def __init__(self, reason: str, message: str):
        # Only the message is forwarded to Exception, so str(error) == message.
        super().__init__(message)
        self.message = message
        self.reason = reason
|
|
|
|
|
|
class DedupeRunner:
|
|
def __init__(self, task_store, preprocessor, task_stream):
|
|
self.task_store = task_store
|
|
self.preprocessor = preprocessor
|
|
self.task_stream = task_stream
|
|
|
|
def run(self, task_id: str, current_stats: dict, config_snapshot: dict):
|
|
dedupe_stats = current_stats['dedupe'].copy()
|
|
candidates = self.task_store.list_dedupe_candidate_items(task_id)
|
|
dedupe_stats['input_items'] = len(candidates)
|
|
current_stats['dedupe'] = dedupe_stats.copy()
|
|
self._persist_progress(task_id, current_stats, dedupe_stats)
|
|
|
|
if not candidates:
|
|
return
|
|
|
|
library_index = self._index_library_files(config_snapshot['output'])
|
|
dedupe_stats['library_candidates'] = library_index['count']
|
|
current_stats['dedupe'] = dedupe_stats.copy()
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'dedupe.library_indexed',
|
|
'dedupe',
|
|
{'count': library_index['count']}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='info',
|
|
event_type='dedupe.library_indexed',
|
|
message=f'已索引输出库音频: {library_index["count"]} 个',
|
|
payload={'count': library_index['count']}
|
|
)
|
|
|
|
groups = _group_batch_candidates(candidates)
|
|
winners: list[dict] = []
|
|
processed_count = 0
|
|
last_progress_at = time.monotonic()
|
|
|
|
for group in groups:
|
|
running_items = [
|
|
self.task_store.update_task_item(
|
|
item['id'],
|
|
dedupe_status='running',
|
|
dedupe_reason=None,
|
|
dedupe_message=None,
|
|
dedupe_group_key=None,
|
|
duplicate_of_path=None,
|
|
duplicate_of_item_id=None,
|
|
dedupe_decision_json=None
|
|
)
|
|
for item in group
|
|
]
|
|
winner, batch_duplicates, identity_basis, group_key = _select_batch_winner(running_items)
|
|
|
|
for duplicate_item in batch_duplicates:
|
|
try:
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'dedupe.lookup_started',
|
|
'dedupe',
|
|
{'item': duplicate_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='info',
|
|
event_type='dedupe.lookup_started',
|
|
message=f'开始比对重复项: {duplicate_item["relative_path"]}',
|
|
payload={'item': duplicate_item}
|
|
)
|
|
dedupe_stats['batch_duplicates'] += 1
|
|
trashed_path = self._move_file_to_trash(
|
|
config_snapshot['trash'],
|
|
'duplicates',
|
|
task_id,
|
|
duplicate_item['id'],
|
|
duplicate_item['current_file_path']
|
|
)
|
|
final_item = self.task_store.update_task_item(
|
|
duplicate_item['id'],
|
|
is_active=0,
|
|
current_file_path=trashed_path,
|
|
trash_file_path=trashed_path,
|
|
dedupe_status='duplicate_trashed',
|
|
dedupe_reason='batch_duplicate',
|
|
dedupe_message='当前批次中存在更优文件,已移入回收站',
|
|
dedupe_group_key=group_key,
|
|
duplicate_of_path=winner['current_file_path'],
|
|
duplicate_of_item_id=winner['id'],
|
|
dedupe_decision_json={
|
|
'comparison_scope': 'batch',
|
|
'identity_basis': identity_basis,
|
|
'quality_breakdown': {
|
|
'kept': _build_quality_breakdown(winner),
|
|
'trashed': _build_quality_breakdown(duplicate_item)
|
|
},
|
|
'kept_side': 'batch',
|
|
'trashed_path': trashed_path,
|
|
'replaced_existing_path': None,
|
|
'compared_candidates': [
|
|
_serialize_compared_candidate('kept', winner),
|
|
_serialize_compared_candidate('trashed', duplicate_item)
|
|
]
|
|
}
|
|
)
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'dedupe.item_duplicate',
|
|
'dedupe',
|
|
{'item': final_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='warning',
|
|
event_type='dedupe.item_duplicate',
|
|
message=f'批次重复已淘汰: {duplicate_item["relative_path"]}',
|
|
payload={'item': final_item}
|
|
)
|
|
except DedupeItemError as error:
|
|
dedupe_stats['failed_items'] += 1
|
|
final_item = self.task_store.update_task_item(
|
|
duplicate_item['id'],
|
|
dedupe_status='failed',
|
|
dedupe_reason=error.reason,
|
|
dedupe_message=error.message,
|
|
dedupe_group_key=group_key,
|
|
duplicate_of_path=winner['current_file_path'],
|
|
duplicate_of_item_id=winner['id']
|
|
)
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'dedupe.item_failed',
|
|
'dedupe',
|
|
{'item': final_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='error',
|
|
event_type='dedupe.item_failed',
|
|
message=f'重复检测失败: {duplicate_item["relative_path"]}',
|
|
payload={'item': final_item}
|
|
)
|
|
processed_count += 1
|
|
last_progress_at = self._maybe_persist_progress(
|
|
task_id,
|
|
current_stats,
|
|
dedupe_stats,
|
|
processed_count,
|
|
last_progress_at
|
|
)
|
|
|
|
winners.append(
|
|
self.task_store.update_task_item(
|
|
winner['id'],
|
|
dedupe_status='running',
|
|
dedupe_group_key=group_key
|
|
)
|
|
)
|
|
|
|
replace_enabled = bool(
|
|
(config_snapshot.get('advancedStrategy') or {}).get('replaceLowQualityDuplicates')
|
|
)
|
|
|
|
for winner in winners:
|
|
if not winner['is_active']:
|
|
continue
|
|
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'dedupe.lookup_started',
|
|
'dedupe',
|
|
{'item': winner}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='info',
|
|
event_type='dedupe.lookup_started',
|
|
message=f'开始比对重复项: {winner["relative_path"]}',
|
|
payload={'item': winner}
|
|
)
|
|
identity_basis, identity_key = _choose_primary_identity(winner)
|
|
if not identity_basis or not identity_key:
|
|
unique_item = self.task_store.update_task_item(
|
|
winner['id'],
|
|
dedupe_status='unique',
|
|
dedupe_reason=None,
|
|
dedupe_message='未发现重复项',
|
|
dedupe_group_key=winner.get('dedupe_group_key') or f'item:{winner["id"]}',
|
|
dedupe_decision_json={
|
|
'comparison_scope': 'none',
|
|
'identity_basis': None,
|
|
'quality_breakdown': {'kept': _build_quality_breakdown(winner)},
|
|
'kept_side': 'batch',
|
|
'trashed_path': None,
|
|
'replaced_existing_path': None,
|
|
'compared_candidates': [_serialize_compared_candidate('kept', winner)]
|
|
}
|
|
)
|
|
dedupe_stats['kept_items'] += 1
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'dedupe.item_unique',
|
|
'dedupe',
|
|
{'item': unique_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='success',
|
|
event_type='dedupe.item_unique',
|
|
message=f'未发现重复项,保留文件: {winner["relative_path"]}',
|
|
payload={'item': unique_item}
|
|
)
|
|
processed_count += 1
|
|
last_progress_at = self._maybe_persist_progress(
|
|
task_id,
|
|
current_stats,
|
|
dedupe_stats,
|
|
processed_count,
|
|
last_progress_at
|
|
)
|
|
continue
|
|
|
|
library_candidates = library_index['by_basis'].get(identity_basis, {}).get(identity_key, [])
|
|
if not library_candidates:
|
|
unique_item = self.task_store.update_task_item(
|
|
winner['id'],
|
|
dedupe_status='unique',
|
|
dedupe_reason=None,
|
|
dedupe_message='未发现库内重复项',
|
|
dedupe_group_key=winner.get('dedupe_group_key') or identity_key,
|
|
dedupe_decision_json={
|
|
'comparison_scope': 'library',
|
|
'identity_basis': identity_basis,
|
|
'quality_breakdown': {'kept': _build_quality_breakdown(winner)},
|
|
'kept_side': 'batch',
|
|
'trashed_path': None,
|
|
'replaced_existing_path': None,
|
|
'compared_candidates': [_serialize_compared_candidate('kept', winner)]
|
|
}
|
|
)
|
|
dedupe_stats['kept_items'] += 1
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'dedupe.item_unique',
|
|
'dedupe',
|
|
{'item': unique_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='success',
|
|
event_type='dedupe.item_unique',
|
|
message=f'未发现库内重复项,保留文件: {winner["relative_path"]}',
|
|
payload={'item': unique_item}
|
|
)
|
|
processed_count += 1
|
|
last_progress_at = self._maybe_persist_progress(
|
|
task_id,
|
|
current_stats,
|
|
dedupe_stats,
|
|
processed_count,
|
|
last_progress_at
|
|
)
|
|
continue
|
|
|
|
library_item = max(library_candidates, key=lambda candidate: _build_quality_breakdown(candidate)['total'])
|
|
dedupe_stats['library_duplicates'] += 1
|
|
winner_quality = _build_quality_breakdown(winner)
|
|
library_quality = _build_quality_breakdown(library_item)
|
|
|
|
try:
|
|
if replace_enabled and winner_quality['total'] >= library_quality['total'] + REPLACE_SCORE_THRESHOLD:
|
|
replaced_path = self._move_file_to_trash(
|
|
config_snapshot['trash'],
|
|
'duplicates',
|
|
task_id,
|
|
winner['id'],
|
|
library_item['file_path']
|
|
)
|
|
final_item = self.task_store.update_task_item(
|
|
winner['id'],
|
|
dedupe_status='duplicate_replaced',
|
|
dedupe_reason='replaced_library_duplicate',
|
|
dedupe_message='当前文件质量明显更高,已替换库内旧文件',
|
|
dedupe_group_key=winner.get('dedupe_group_key') or identity_key,
|
|
duplicate_of_path=library_item['file_path'],
|
|
duplicate_of_item_id=None,
|
|
dedupe_decision_json={
|
|
'comparison_scope': 'library',
|
|
'identity_basis': identity_basis,
|
|
'quality_breakdown': {
|
|
'kept': winner_quality,
|
|
'replaced': library_quality
|
|
},
|
|
'kept_side': 'batch',
|
|
'trashed_path': replaced_path,
|
|
'replaced_existing_path': library_item['file_path'],
|
|
'compared_candidates': [
|
|
_serialize_compared_candidate('kept', winner),
|
|
_serialize_compared_candidate('replaced', library_item)
|
|
]
|
|
}
|
|
)
|
|
dedupe_stats['replaced_library_items'] += 1
|
|
dedupe_stats['kept_items'] += 1
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'dedupe.item_replaced',
|
|
'dedupe',
|
|
{'item': final_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='success',
|
|
event_type='dedupe.item_replaced',
|
|
message=f'已替换库内旧文件: {winner["relative_path"]}',
|
|
payload={'item': final_item}
|
|
)
|
|
else:
|
|
trashed_path = self._move_file_to_trash(
|
|
config_snapshot['trash'],
|
|
'duplicates',
|
|
task_id,
|
|
winner['id'],
|
|
winner['current_file_path']
|
|
)
|
|
final_item = self.task_store.update_task_item(
|
|
winner['id'],
|
|
is_active=0,
|
|
current_file_path=trashed_path,
|
|
trash_file_path=trashed_path,
|
|
dedupe_status='duplicate_trashed',
|
|
dedupe_reason='library_duplicate',
|
|
dedupe_message='输出库中已存在重复文件,保留库内文件',
|
|
dedupe_group_key=winner.get('dedupe_group_key') or identity_key,
|
|
duplicate_of_path=library_item['file_path'],
|
|
duplicate_of_item_id=None,
|
|
dedupe_decision_json={
|
|
'comparison_scope': 'library',
|
|
'identity_basis': identity_basis,
|
|
'quality_breakdown': {
|
|
'kept': library_quality,
|
|
'trashed': winner_quality
|
|
},
|
|
'kept_side': 'library',
|
|
'trashed_path': trashed_path,
|
|
'replaced_existing_path': None,
|
|
'compared_candidates': [
|
|
_serialize_compared_candidate('kept', library_item),
|
|
_serialize_compared_candidate('trashed', winner)
|
|
]
|
|
}
|
|
)
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'dedupe.item_duplicate',
|
|
'dedupe',
|
|
{'item': final_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='warning',
|
|
event_type='dedupe.item_duplicate',
|
|
message=f'输出库已存在重复文件,已淘汰: {winner["relative_path"]}',
|
|
payload={'item': final_item}
|
|
)
|
|
except DedupeItemError as error:
|
|
dedupe_stats['failed_items'] += 1
|
|
final_item = self.task_store.update_task_item(
|
|
winner['id'],
|
|
dedupe_status='failed',
|
|
dedupe_reason=error.reason,
|
|
dedupe_message=error.message,
|
|
dedupe_group_key=winner.get('dedupe_group_key') or identity_key,
|
|
duplicate_of_path=library_item['file_path']
|
|
)
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'dedupe.item_failed',
|
|
'dedupe',
|
|
{'item': final_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='error',
|
|
event_type='dedupe.item_failed',
|
|
message=f'重复检测失败: {winner["relative_path"]}',
|
|
payload={'item': final_item}
|
|
)
|
|
|
|
processed_count += 1
|
|
last_progress_at = self._maybe_persist_progress(
|
|
task_id,
|
|
current_stats,
|
|
dedupe_stats,
|
|
processed_count,
|
|
last_progress_at
|
|
)
|
|
|
|
self._persist_progress(task_id, current_stats, dedupe_stats)
|
|
|
|
def _index_library_files(self, output_dir: str) -> dict:
|
|
return library_index.build_library_index(
|
|
output_dir,
|
|
probe_audio=self._safe_probe_audio,
|
|
read_tags=self._safe_read_library_tags
|
|
)
|
|
|
|
def _safe_probe_audio(self, file_path: str) -> dict:
|
|
try:
|
|
return self.preprocessor.probe_audio(file_path)
|
|
except Exception:
|
|
return {}
|
|
|
|
def _safe_read_library_tags(self, file_path: str) -> dict:
|
|
return library_index.safe_read_tags(library_index.default_read_library_tags, file_path)
|
|
|
|
def _move_file_to_trash(
|
|
self,
|
|
trash_root: str,
|
|
reason: str,
|
|
task_id: str,
|
|
item_id: int | None,
|
|
source_path: str
|
|
) -> str:
|
|
source = Path(source_path)
|
|
if not source.exists():
|
|
raise DedupeItemError('source_missing', f'源文件不存在: {source}')
|
|
destination = _build_unique_destination(
|
|
Path(trash_root) / reason / task_id,
|
|
_build_prefixed_name(item_id, source.name)
|
|
)
|
|
destination.parent.mkdir(parents=True, exist_ok=True)
|
|
try:
|
|
self._move_file(source, destination)
|
|
except OSError as error:
|
|
raise DedupeItemError('trash_move_failed', f'移动到回收站失败: {error}') from error
|
|
return str(destination.resolve(strict=False))
|
|
|
|
def _move_file(self, source: Path, destination: Path):
|
|
shutil.move(str(source), str(destination))
|
|
|
|
def _persist_progress(self, task_id: str, current_stats: dict, dedupe_stats: dict[str, int]):
|
|
current_stats['dedupe'] = dedupe_stats.copy()
|
|
self.task_store.update_task(
|
|
task_id,
|
|
status=TASK_STATUS_RUNNING,
|
|
current_stage='dedupe',
|
|
stats=current_stats
|
|
)
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'dedupe.progress',
|
|
'dedupe',
|
|
{'stats': current_stats}
|
|
)
|
|
|
|
def _append_log(
|
|
self,
|
|
task_id: str,
|
|
*,
|
|
level: str,
|
|
event_type: str,
|
|
message: str,
|
|
payload: dict | None = None
|
|
):
|
|
persisted_log = self.task_store.append_log(
|
|
task_id,
|
|
'dedupe',
|
|
level,
|
|
event_type,
|
|
message,
|
|
payload
|
|
)
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'log.appended',
|
|
'dedupe',
|
|
{'log': persisted_log}
|
|
)
|
|
|
|
def _maybe_persist_progress(
|
|
self,
|
|
task_id: str,
|
|
current_stats: dict,
|
|
dedupe_stats: dict[str, int],
|
|
processed_count: int,
|
|
last_progress_at: float
|
|
) -> float:
|
|
now = time.monotonic()
|
|
if (
|
|
processed_count % DEDUPE_PROGRESS_BATCH_SIZE == 0
|
|
or now - last_progress_at >= DEDUPE_PROGRESS_INTERVAL_SECONDS
|
|
):
|
|
self._persist_progress(task_id, current_stats, dedupe_stats)
|
|
return now
|
|
return last_progress_at
|
|
|
|
|
|
class OrganizeRunner:
|
|
def __init__(self, task_store, task_stream):
|
|
self.task_store = task_store
|
|
self.task_stream = task_stream
|
|
|
|
def run(self, task_id: str, current_stats: dict, config_snapshot: dict):
|
|
organize_stats = current_stats['organize'].copy()
|
|
candidates = self.task_store.list_organize_candidate_items(task_id)
|
|
organize_stats['input_items'] = len(candidates)
|
|
current_stats['organize'] = organize_stats.copy()
|
|
self._persist_progress(task_id, current_stats, organize_stats)
|
|
|
|
if not candidates:
|
|
return
|
|
|
|
output_root = Path(config_snapshot['output']).expanduser().resolve(strict=False)
|
|
trash_root = Path(config_snapshot['trash']).expanduser().resolve(strict=False)
|
|
processed_count = 0
|
|
last_progress_at = time.monotonic()
|
|
|
|
for original_item in candidates:
|
|
item = self.task_store.update_task_item(
|
|
original_item['id'],
|
|
organize_status='running',
|
|
organize_reason=None,
|
|
organize_message=None,
|
|
library_relative_path=None,
|
|
library_file_path=None,
|
|
organize_decision_json=None
|
|
)
|
|
|
|
try:
|
|
plan = _build_organize_plan(output_root, item)
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'organize.path_planned',
|
|
'organize',
|
|
{
|
|
'item': item,
|
|
'planned_relative_path': plan['planned_relative_path']
|
|
}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='info',
|
|
event_type='organize.path_planned',
|
|
message=f'已规划入库路径: {item["relative_path"]}',
|
|
payload={
|
|
'item': item,
|
|
'planned_relative_path': plan['planned_relative_path']
|
|
}
|
|
)
|
|
final_path, collision_count = self._resolve_destination(
|
|
output_root / plan['planned_relative_path'],
|
|
Path(item['current_file_path'])
|
|
)
|
|
final_path.parent.mkdir(parents=True, exist_ok=True)
|
|
source_path = Path(item['current_file_path'])
|
|
if not source_path.exists():
|
|
raise OrganizeItemError('source_missing', f'源文件不存在: {source_path}')
|
|
self._move_file(source_path, final_path)
|
|
final_relative_path = final_path.relative_to(output_root).as_posix()
|
|
renamed = Path(item['current_file_path']).name != final_path.name
|
|
moved = Path(item['current_file_path']).resolve(strict=False) != final_path.resolve(strict=False)
|
|
|
|
if moved:
|
|
organize_stats['moved_items'] += 1
|
|
if renamed:
|
|
organize_stats['renamed_items'] += 1
|
|
if collision_count > 1:
|
|
organize_stats['collision_resolved'] += 1
|
|
|
|
final_item = self.task_store.update_task_item(
|
|
item['id'],
|
|
current_file_path=str(final_path.resolve(strict=False)),
|
|
filename=final_path.name,
|
|
organize_status='organized',
|
|
organize_reason=None,
|
|
organize_message='已按标准路径入库',
|
|
library_relative_path=final_relative_path,
|
|
library_file_path=str(final_path.resolve(strict=False)),
|
|
organize_decision_json={
|
|
'source_path': item['current_file_path'],
|
|
'planned_relative_path': plan['planned_relative_path'],
|
|
'final_relative_path': final_relative_path,
|
|
'collision_strategy': 'suffix' if collision_count > 1 else 'none',
|
|
'trashed_on_failure': None,
|
|
'final_action': 'organized'
|
|
}
|
|
)
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'organize.item_organized',
|
|
'organize',
|
|
{'item': final_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='success',
|
|
event_type='organize.item_organized',
|
|
message=f'文件已入库: {final_relative_path}',
|
|
payload={'item': final_item}
|
|
)
|
|
except OrganizeItemError as error:
|
|
organize_stats['failed_items'] += 1
|
|
final_item = self._handle_failure(task_id, item, error, output_root, trash_root)
|
|
if final_item['organize_status'] == 'trashed':
|
|
organize_stats['trashed_items'] += 1
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'organize.item_trashed',
|
|
'organize',
|
|
{'item': final_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='warning',
|
|
event_type='organize.item_trashed',
|
|
message=f'入库失败后已移入回收站: {item["relative_path"]}',
|
|
payload={'item': final_item}
|
|
)
|
|
else:
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'organize.item_failed',
|
|
'organize',
|
|
{'item': final_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='error',
|
|
event_type='organize.item_failed',
|
|
message=f'整理入库失败: {item["relative_path"]}',
|
|
payload={'item': final_item}
|
|
)
|
|
except OSError as error:
|
|
organize_stats['failed_items'] += 1
|
|
final_item = self._handle_failure(
|
|
task_id,
|
|
item,
|
|
OrganizeItemError('move_failed', f'整理入库失败: {error}'),
|
|
output_root,
|
|
trash_root
|
|
)
|
|
if final_item['organize_status'] == 'trashed':
|
|
organize_stats['trashed_items'] += 1
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'organize.item_trashed',
|
|
'organize',
|
|
{'item': final_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='warning',
|
|
event_type='organize.item_trashed',
|
|
message=f'入库失败后已移入回收站: {item["relative_path"]}',
|
|
payload={'item': final_item}
|
|
)
|
|
else:
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'organize.item_failed',
|
|
'organize',
|
|
{'item': final_item}
|
|
)
|
|
self._append_log(
|
|
task_id,
|
|
level='error',
|
|
event_type='organize.item_failed',
|
|
message=f'整理入库失败: {item["relative_path"]}',
|
|
payload={'item': final_item}
|
|
)
|
|
|
|
processed_count += 1
|
|
last_progress_at = self._maybe_persist_progress(
|
|
task_id,
|
|
current_stats,
|
|
organize_stats,
|
|
processed_count,
|
|
last_progress_at
|
|
)
|
|
|
|
self._persist_progress(task_id, current_stats, organize_stats)
|
|
|
|
def _handle_failure(
|
|
self,
|
|
task_id: str,
|
|
item: dict,
|
|
error: OrganizeItemError,
|
|
output_root: Path,
|
|
trash_root: Path
|
|
) -> dict:
|
|
source_path = Path(item['current_file_path'])
|
|
trashed_path = None
|
|
final_status = 'failed'
|
|
message = error.message
|
|
|
|
if source_path.exists():
|
|
destination = _build_unique_destination(
|
|
trash_root / 'organize_failed' / task_id,
|
|
_build_prefixed_name(item['id'], source_path.name)
|
|
)
|
|
destination.parent.mkdir(parents=True, exist_ok=True)
|
|
try:
|
|
self._move_file(source_path, destination)
|
|
trashed_path = str(destination.resolve(strict=False))
|
|
final_status = 'trashed'
|
|
except OSError as trash_error:
|
|
message = f'{error.message}; 移入回收站失败: {trash_error}'
|
|
|
|
return self.task_store.update_task_item(
|
|
item['id'],
|
|
current_file_path=trashed_path or item['current_file_path'],
|
|
trash_file_path=trashed_path,
|
|
organize_status=final_status,
|
|
organize_reason=error.reason,
|
|
organize_message=message,
|
|
organize_decision_json={
|
|
'source_path': item['current_file_path'],
|
|
'planned_relative_path': None,
|
|
'final_relative_path': None,
|
|
'collision_strategy': 'none',
|
|
'trashed_on_failure': trashed_path,
|
|
'final_action': final_status
|
|
}
|
|
)
|
|
|
|
def _resolve_destination(self, desired_path: Path, source_path: Path) -> tuple[Path, int]:
|
|
candidate = desired_path
|
|
collision_index = 1
|
|
|
|
while candidate.exists():
|
|
if candidate.resolve(strict=False) == source_path.resolve(strict=False):
|
|
return candidate, collision_index
|
|
collision_index += 1
|
|
candidate = candidate.with_name(
|
|
f'{desired_path.stem} ({collision_index}){desired_path.suffix}'
|
|
)
|
|
|
|
return candidate, collision_index
|
|
|
|
def _move_file(self, source: Path, destination: Path):
|
|
shutil.move(str(source), str(destination))
|
|
|
|
def _persist_progress(self, task_id: str, current_stats: dict, organize_stats: dict[str, int]):
|
|
current_stats['organize'] = organize_stats.copy()
|
|
self.task_store.update_task(
|
|
task_id,
|
|
status=TASK_STATUS_RUNNING,
|
|
current_stage='organize',
|
|
stats=current_stats
|
|
)
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'organize.progress',
|
|
'organize',
|
|
{'stats': current_stats}
|
|
)
|
|
|
|
def _append_log(
|
|
self,
|
|
task_id: str,
|
|
*,
|
|
level: str,
|
|
event_type: str,
|
|
message: str,
|
|
payload: dict | None = None
|
|
):
|
|
persisted_log = self.task_store.append_log(
|
|
task_id,
|
|
'organize',
|
|
level,
|
|
event_type,
|
|
message,
|
|
payload
|
|
)
|
|
self.task_stream.broadcast_event(
|
|
task_id,
|
|
'log.appended',
|
|
'organize',
|
|
{'log': persisted_log}
|
|
)
|
|
|
|
def _maybe_persist_progress(
|
|
self,
|
|
task_id: str,
|
|
current_stats: dict,
|
|
organize_stats: dict[str, int],
|
|
processed_count: int,
|
|
last_progress_at: float
|
|
) -> float:
|
|
now = time.monotonic()
|
|
if (
|
|
processed_count % ORGANIZE_PROGRESS_BATCH_SIZE == 0
|
|
or now - last_progress_at >= ORGANIZE_PROGRESS_INTERVAL_SECONDS
|
|
):
|
|
self._persist_progress(task_id, current_stats, organize_stats)
|
|
return now
|
|
return last_progress_at
|
|
|
|
|
|
def _group_batch_candidates(items: list[dict]) -> list[list[dict]]:
    """Partition batch items into groups of items that share an identity key.

    Items are processed in order. An item joins the first existing group that
    one of its identity keys points to; otherwise it starts a new group. Items
    with no identity key always form a group of their own.

    Args:
        items: Task items to group.

    Returns:
        The groups in order of creation; each group lists its members in
        input order.
    """
    groups: list[list[dict]] = []
    group_by_identity: dict[tuple[str, str], list[dict]] = {}

    for item in items:
        identity_keys = [
            (identity_basis, identity_key)
            for identity_basis, identity_key in _identity_keys_for_item(item, include_fingerprint=True)
        ]
        target_group = next(
            (group_by_identity[key] for key in identity_keys if key in group_by_identity),
            None
        )

        if target_group is None:
            target_group = [item]
            groups.append(target_group)
            if not identity_keys:
                # Synthetic key: keeps identity-less items in their own group.
                group_by_identity[(f'item:{item["id"]}', str(item['id']))] = target_group
        else:
            target_group.append(item)

        # Register every key of the item, including when it joined an existing
        # group. Without this, grouping depends on input order: with items
        # carrying keys {A}, {A, B}, {B} the last one would miss the group
        # although it shares key B with a member of it.
        for key in identity_keys:
            group_by_identity.setdefault(key, target_group)

    return groups
|
|
|
|
|
|
def _select_batch_winner(group: list[dict]) -> tuple[dict, list[dict], str | None, str]:
    """Pick the item to keep from a non-empty group of batch duplicates.

    Returns:
        ``(winner, losers, identity_basis, group_key)``. ``group_key`` is the
        winner's primary identity key, or ``'item:<id>'`` when it has none.
    """
    best = group[0]
    for challenger in group[1:]:
        # A negative comparison result means the challenger ranks first.
        if _compare_batch_candidates(challenger, best) < 0:
            best = challenger

    basis, primary_key = _choose_primary_identity(best)
    losers = [member for member in group if member['id'] != best['id']]
    group_key = primary_key or f'item:{best["id"]}'
    return best, losers, basis, group_key
|
|
|
|
|
|
def _compare_batch_candidates(left: dict, right: dict) -> int:
    """Order two batch candidates; a negative result means ``left`` is preferred.

    Tie-breakers, in order: higher quality total, authoritative match with
    higher confidence, earlier ``created_at``, lower ``id``.

    Returns:
        ``-1`` when ``left`` wins, ``1`` when ``right`` wins, ``0`` when the
        two are indistinguishable.
    """
    def _authority(item: dict) -> tuple:
        return (1 if item.get('match_is_authoritative') else 0, item.get('match_confidence') or 0)

    left_total = _build_quality_breakdown(left)['total']
    right_total = _build_quality_breakdown(right)['total']
    if left_total != right_total:
        return -1 if left_total > right_total else 1

    left_rank, right_rank = _authority(left), _authority(right)
    if left_rank != right_rank:
        return -1 if left_rank > right_rank else 1

    left_stamp = left.get('created_at') or ''
    right_stamp = right.get('created_at') or ''
    if left_stamp != right_stamp:
        return -1 if left_stamp < right_stamp else 1

    if left['id'] == right['id']:
        return 0
    return -1 if left['id'] < right['id'] else 1
|
|
|
|
|
|
def _build_library_metadata(tags: dict, audio_props: dict, file_path: Path) -> dict:
    """Forward to ``library_index.build_library_metadata`` unchanged."""
    metadata = library_index.build_library_metadata(tags, audio_props, file_path)
    return metadata
|
|
|
|
|
|
def _identity_keys_for_item(item: dict, *, include_fingerprint: bool) -> list[tuple[str, str]]:
    """Forward to ``library_index.identity_keys_for_item`` unchanged."""
    identity_keys = library_index.identity_keys_for_item(
        item,
        include_fingerprint=include_fingerprint
    )
    return identity_keys
|
|
|
|
|
|
def _choose_primary_identity(item: dict) -> tuple[str | None, str | None]:
    """Forward to ``library_index.choose_primary_identity`` unchanged."""
    primary_identity = library_index.choose_primary_identity(item)
    return primary_identity
|
|
|
|
|
|
def _build_quality_breakdown(item: dict) -> dict:
    """Score the audio quality of an item as weighted components plus a total.

    Component ceilings: bit depth 30, sample rate 20, bitrate 20, lossless 15,
    channels 5, size/duration consistency 5, match quality 5.

    Returns:
        A dict with one entry per component and their rounded sum in
        ``'total'``.
    """
    def _scaled(raw, ceiling: float, weight: float) -> float:
        # Linear share of ``weight``; values above ``ceiling`` earn no extra.
        return round(min((raw or 0) / ceiling, 1.0) * weight, 2)

    props = item.get('audio_props_json') or {}
    duration_seconds = _first_non_empty(
        (item.get('matched_metadata_json') or {}).get('duration_seconds'),
        item.get('fingerprint_duration_seconds'),
        props.get('duration_seconds')
    )
    bit_depth = _safe_float(props.get('bit_depth'))
    sample_rate = _safe_float(props.get('sample_rate'))
    bitrate = _safe_float(props.get('bitrate'))
    channels = _safe_float(props.get('channels'))
    size_bytes = _safe_float(item.get('size_bytes'))

    extension = (item.get('extension') or Path(item.get('current_file_path') or '').suffix).lower()
    codec_name = str(props.get('codec') or '').upper()
    is_lossless = extension in LOSSLESS_EXTENSIONS or codec_name == 'FLAC'

    # More than 1000 bytes per second of audio counts as a plausible file size.
    consistency_score = 0.0
    if size_bytes and duration_seconds:
        duration_value = _safe_float(duration_seconds)
        if duration_value and size_bytes / duration_value > 1000:
            consistency_score = 5.0

    authority_points = 3.0 if item.get('match_is_authoritative') else 0.0
    confidence_points = min((_safe_float(item.get('match_confidence')) or 0.0) / 100.0, 1.0) * 2.0

    breakdown = {
        'bit_depth': _scaled(bit_depth, 24.0, 30.0),
        'sample_rate': _scaled(sample_rate, 96000.0, 20.0),
        'bitrate': _scaled(bitrate, 320000.0, 20.0),
        'lossless': 15.0 if is_lossless else 0.0,
        'channels': _scaled(channels, 2.0, 5.0),
        'size_duration_consistency': consistency_score,
        'match_quality': round(authority_points + confidence_points, 2)
    }
    breakdown['total'] = round(sum(breakdown.values()), 2)
    return breakdown
|
|
|
|
|
|
def _serialize_compared_candidate(side: str, item: dict) -> dict:
    """Summarize one side of a comparison for the stored decision record.

    ``path`` prefers the item's ``file_path`` and falls back to
    ``current_file_path``.
    """
    candidate_path = item.get('file_path') or item.get('current_file_path')
    quality_score = _build_quality_breakdown(item)['total']
    return {
        'side': side,
        'item_id': item.get('id'),
        'path': candidate_path,
        'relative_path': item.get('relative_path'),
        'quality_score': quality_score
    }
|
|
|
|
|
|
def _build_organize_plan(output_root: Path, item: dict) -> dict:
    """Plan the library-relative path for an item from its matched metadata.

    Layout:
        album track: ``<bucket>/<album artist>/<album>[/Disc N]/<NN> - <title><ext>``
        single:      ``<bucket>/<album artist>/Singles/<year> - <title>/01 - <title><ext>``

    An item is treated as a single when it has no album or the album is named
    ``single``/``singles``.

    Returns:
        A dict with ``output_root`` and ``planned_relative_path`` (POSIX form).

    Raises:
        OrganizeItemError: ``'invalid_target_path'`` when the artist directory
            or the title sanitizes to an empty string.
    """
    metadata = item.get('matched_metadata_json') or {}

    artist_source = _first_non_empty(
        metadata.get('album_artist'),
        metadata.get('artist'),
        'Unknown Artist'
    )
    album_artist = _sanitize_path_component(artist_source)
    if not album_artist:
        raise OrganizeItemError('invalid_target_path', '无法生成有效的 Album Artist 目录')

    # Without a title tag, fall back to the current file name without suffix.
    title_source = _first_non_empty(
        metadata.get('title'),
        Path(item.get('current_file_path') or item['filename']).stem
    )
    title = _sanitize_path_component(title_source)
    if not title:
        raise OrganizeItemError('invalid_target_path', '无法生成有效的标题文件名')

    year = _extract_year(_first_non_empty(metadata.get('release_date'), metadata.get('year')))
    track_number = _parse_track_number(metadata.get('track_number')) or 1
    disc_number = _parse_track_number(metadata.get('disc_number'))
    extension = (item.get('extension') or Path(item.get('current_file_path') or '').suffix).lower()
    bucket = _bucket_letter(album_artist)
    album = _sanitize_path_component(metadata.get('album'))

    filename = f'{track_number:02d} - {title}{extension}'
    is_single = not album or album.lower() in {'single', 'singles'}
    if is_single:
        year_label = 'Unknown Year' if not year else str(year)
        single_dir = _sanitize_path_component(f'{year_label} - {title}')
        path_parts = [bucket, album_artist, 'Singles', single_dir, f'01 - {title}{extension}']
    else:
        path_parts = [bucket, album_artist, album]
        # A disc folder is only added from the second disc onwards.
        if disc_number and disc_number > 1:
            path_parts.append(f'Disc {disc_number}')
        path_parts.append(filename)

    return {
        'output_root': output_root,
        'planned_relative_path': Path(*path_parts).as_posix()
    }
|
|
|
|
|
|
def _build_prefixed_name(item_id: int | None, filename: str) -> str:
    """Return a sanitized file name, prefixed with ``<item_id>_`` when given.

    Only the final path component of ``filename`` is used; ``'file'`` is the
    fallback when nothing usable remains.
    """
    base_name = Path(filename).name
    safe_name = _sanitize_path_component(base_name, fallback='file')
    if item_id is None:
        return safe_name
    return f'{item_id}_{safe_name}'
|
|
|
|
|
|
def _build_unique_destination(directory: Path, filename: str) -> Path:
    """Return a path in ``directory`` that does not exist yet.

    ``filename`` is used as-is when free; otherwise `` (2)``, `` (3)``, ... is
    appended to the stem until an unused name is found.
    """
    first_choice = directory / filename
    if not first_choice.exists():
        return first_choice

    base, extension = first_choice.stem, first_choice.suffix
    attempt = 2
    numbered = first_choice.with_name(f'{base} ({attempt}){extension}')
    while numbered.exists():
        attempt += 1
        numbered = first_choice.with_name(f'{base} ({attempt}){extension}')
    return numbered
|
|
|
|
|
|
def _bucket_letter(value: str) -> str:
    """Return the one-character bucket folder for an artist name.

    The bucket is the upper-cased first character of the NFKC-normalized,
    stripped name when that is a single ASCII letter or digit, and ``'#'``
    for everything else (empty names, symbols, non-ASCII scripts).
    """
    normalized = unicodedata.normalize('NFKC', value).strip()
    if not normalized:
        return '#'
    first = normalized[0].upper()
    # Upper-casing can expand one character into several ('ß' -> 'SS'); such
    # results must not become a multi-character bucket.
    if len(first) == 1 and first.isascii() and first.isalnum():
        return first
    return '#'
|
|
|
|
|
|
def _sanitize_path_component(value: str | None, fallback: str | None = None) -> str:
|
|
raw = unicodedata.normalize('NFKC', str(value or fallback or '')).strip()
|
|
cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', ' ', raw)
|
|
cleaned = re.sub(r'\s+', ' ', cleaned).strip().rstrip('. ')
|
|
if not cleaned:
|
|
cleaned = fallback or ''
|
|
if len(cleaned) > MAX_PATH_COMPONENT_LENGTH:
|
|
cleaned = cleaned[:MAX_PATH_COMPONENT_LENGTH].rstrip('. ')
|
|
return cleaned
|
|
|
|
|
|
def _normalize_identity_text(value: str | None) -> str:
    """Forward to ``library_index.normalize_identity_text`` unchanged."""
    normalized = library_index.normalize_identity_text(value)
    return normalized
|
|
|
|
|
|
def _extract_preserved_version_tokens(value: str | None) -> set[str]:
    """Forward to ``library_index.extract_preserved_version_tokens`` unchanged."""
    tokens = library_index.extract_preserved_version_tokens(value)
    return tokens
|
|
|
|
|
|
def _normalize_tag_key(value: str) -> str:
    """Forward to ``library_index.normalize_tag_key`` unchanged."""
    normalized_key = library_index.normalize_tag_key(value)
    return normalized_key
|
|
|
|
|
|
def _coerce_tag_value(value) -> str | None:
    """Forward to ``library_index.coerce_tag_value`` unchanged."""
    coerced = library_index.coerce_tag_value(value)
    return coerced
|
|
|
|
|
|
def _parse_track_number(value) -> int | None:
    """Forward to ``library_index.parse_track_number`` unchanged."""
    track_number = library_index.parse_track_number(value)
    return track_number
|
|
|
|
|
|
def _extract_year(value) -> int | None:
    """Forward to ``library_index.extract_year`` unchanged."""
    year = library_index.extract_year(value)
    return year
|
|
|
|
|
|
def _duration_bucket(value) -> int | None:
    """Forward to ``library_index.duration_bucket`` unchanged."""
    bucket = library_index.duration_bucket(value)
    return bucket
|
|
|
|
|
|
def _safe_float(value) -> float | None:
    """Forward to ``library_index.safe_float`` unchanged."""
    converted = library_index.safe_float(value)
    return converted
|
|
|
|
|
|
def _first_non_empty(*values):
    """Forward all arguments to ``library_index.first_non_empty`` unchanged."""
    chosen = library_index.first_non_empty(*values)
    return chosen
|
|
|
|
|
|
def _split_artists(value: str | None) -> list[str]:
    """Forward to ``library_index.split_artists`` unchanged."""
    artists = library_index.split_artists(value)
    return artists
|