Files
MyTool/backend/src/main/java/com/music/service/DedupService.java
2026-01-29 18:26:02 +08:00

455 lines
16 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package com.music.service;
import com.music.dto.ProgressMessage;
import org.jaudiotagger.audio.AudioFile;
import org.jaudiotagger.audio.AudioFileIO;
import org.jaudiotagger.tag.FieldKey;
import org.jaudiotagger.tag.Tag;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.messaging.simp.SimpMessagingTemplate;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Music library de-duplication service.
 *
 * <p>Two grouping strategies are supported and may be combined:
 * <ul>
 *   <li><b>MD5</b> - files whose content digest is identical (exact binary copies).</li>
 *   <li><b>Metadata</b> - files whose normalized artist, title, album and bucketed
 *       track length are identical.</li>
 * </ul>
 * For every group that contains more than one file, the highest-scoring file is kept
 * and the remaining files are moved or copied into the trash directory. A file is
 * handled as a duplicate at most once per task, even if it appears in both an MD5
 * group and a metadata group.
 */
@Service
public class DedupService {

    private static final Logger log = LoggerFactory.getLogger(DedupService.class);

    /** Lower-case file extensions that are treated as audio files. */
    private static final Set<String> AUDIO_EXTENSIONS = new HashSet<>(Arrays.asList(
            "mp3", "flac", "wav", "m4a", "aac", "ogg", "wma", "ape", "aiff", "aif", "wv", "tta", "opus"
    ));

    /** Lossless formats other than FLAC; they rank between FLAC and lossy formats. */
    private static final Set<String> OTHER_LOSSLESS_EXTENSIONS = new HashSet<>(Arrays.asList(
            "wav", "ape", "aiff", "aif", "wv", "tta"
    ));

    /** Width, in seconds, of the duration bucket used for metadata matching. */
    private static final int DURATION_TOLERANCE_SECONDS = 5;

    /** A scan progress message is published every this many successfully scanned files. */
    private static final int SCAN_PROGRESS_INTERVAL = 50;

    private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();

    private final SimpMessagingTemplate messagingTemplate;
    private final ProgressStore progressStore;

    public DedupService(SimpMessagingTemplate messagingTemplate, ProgressStore progressStore) {
        this.messagingTemplate = messagingTemplate;
        this.progressStore = progressStore;
    }

    /** Mutable counters for a single de-duplication run (used by one thread only). */
    private static final class Stats {
        int scanned;
        int duplicateGroups;
        int moved;
        int failed;
    }

    /**
     * Runs a de-duplication task asynchronously and reports progress under {@code taskId}.
     *
     * <p>All arguments are validated before anything is written to disk. Any unexpected
     * exception is logged and reported as a final (completed) progress message instead
     * of being propagated.
     *
     * @param taskId      identifier used for progress storage and the progress topic
     * @param libraryDir  root directory of the music library to scan
     * @param trashDir    directory that receives the duplicates (created if missing)
     * @param useMd5      enable exact binary matching via MD5
     * @param useMetadata enable tag-based matching
     * @param mode        {@code "move"} or {@code "copy"} (case-insensitive)
     */
    @Async
    public void dedup(String taskId,
                      String libraryDir,
                      String trashDir,
                      boolean useMd5,
                      boolean useMetadata,
                      String mode) {
        try {
            // Path parsing happens inside the try block so that a null or malformed
            // path is reported through the progress channel instead of escaping.
            Path libraryPath = libraryDir == null ? null : Paths.get(libraryDir);
            if (libraryPath == null || !Files.isDirectory(libraryPath)) {
                sendProgress(taskId, 0, 0, 0, 0,
                        "音乐库目录不存在或不是目录", true);
                return;
            }
            if (trashDir == null) {
                sendProgress(taskId, 0, 0, 0, 0,
                        "回收站目录未指定", true);
                return;
            }
            boolean move = "move".equalsIgnoreCase(mode);
            if (!move && !"copy".equalsIgnoreCase(mode)) {
                sendProgress(taskId, 0, 0, 0, 0,
                        "执行模式错误,必须是 copy 或 move", true);
                return;
            }
            if (!useMd5 && !useMetadata) {
                sendProgress(taskId, 0, 0, 0, 0,
                        "至少需要启用一种去重策略(MD5 或元数据匹配)", true);
                return;
            }

            Path trashPath = Paths.get(trashDir);
            if (normalized(trashPath).equals(normalized(libraryPath))) {
                sendProgress(taskId, 0, 0, 0, 0,
                        "回收站目录不能与音乐库目录相同", true);
                return;
            }
            if (Files.exists(trashPath) && !Files.isDirectory(trashPath)) {
                sendProgress(taskId, 0, 0, 0, 0,
                        "回收站路径已存在但不是目录", true);
                return;
            }
            // Created only after every argument has been validated.
            Files.createDirectories(trashPath);

            List<Path> audioFiles = collectAudioFiles(libraryPath, trashPath);
            int total = audioFiles.size();
            if (total == 0) {
                sendProgress(taskId, 0, 0, 0, 0,
                        "未在音乐库中找到音频文件", true);
                return;
            }

            Stats stats = new Stats();
            sendProgress(taskId, total, 0, 0, 0,
                    "开始扫描音乐库...", false);

            // Insertion-ordered maps keep the processing order of groups deterministic.
            Map<String, List<Path>> md5Groups = new LinkedHashMap<>();
            Map<MetadataKey, List<Path>> metadataGroups = new LinkedHashMap<>();

            // Phase 1: scan every file and build the groups for the enabled strategies.
            for (Path file : audioFiles) {
                try {
                    if (useMd5) {
                        md5Groups.computeIfAbsent(calculateMd5(file), k -> new ArrayList<>()).add(file);
                    }
                    if (useMetadata) {
                        readMetadataKey(file).ifPresent(key -> metadataGroups
                                .computeIfAbsent(key, k -> new ArrayList<>())
                                .add(file));
                    }
                    stats.scanned++;
                    if (stats.scanned % SCAN_PROGRESS_INTERVAL == 0) {
                        sendProgress(taskId, total, stats.scanned,
                                stats.duplicateGroups, stats.moved,
                                String.format("扫描中(%d/%d)", stats.scanned, total),
                                false);
                    }
                } catch (Exception e) {
                    stats.failed++;
                    log.warn("扫描文件失败: {}", file, e);
                }
            }

            // Files already sent to the trash as duplicates. Shared by both phases so
            // that the metadata phase never picks or re-processes such a file.
            Set<Path> handled = new HashSet<>();

            // Phase 2: exact binary duplicates (the map is empty when MD5 is disabled).
            processGroups(md5Groups.values(), handled, trashPath, move, taskId, total, stats);
            // Phase 3: metadata duplicates (the map is empty when metadata is disabled).
            processGroups(metadataGroups.values(), handled, trashPath, move, taskId, total, stats);

            sendProgress(taskId, total, stats.scanned,
                    stats.duplicateGroups, stats.moved,
                    String.format("任务完成!扫描文件: %d, 重复组: %d, 移动/复制文件: %d, 失败: %d",
                            stats.scanned, stats.duplicateGroups, stats.moved, stats.failed),
                    true);
        } catch (Exception e) {
            log.error("去重任务执行失败", e);
            sendProgress(taskId, 0, 0, 0, 0,
                    "任务执行失败: " + e.getMessage(), true);
        }
    }

    /** Returns the absolute, normalized form of {@code path} for location comparisons. */
    private static Path normalized(Path path) {
        return path.toAbsolutePath().normalize();
    }

    /**
     * Walks the library and returns every file with a known audio extension.
     *
     * <p>The trash directory is skipped when it lies inside the library, so earlier
     * duplicates are not scanned again. Entries that cannot be accessed are logged and
     * skipped rather than aborting the whole walk.
     */
    private List<Path> collectAudioFiles(Path libraryPath, Path trashPath) throws IOException {
        final Path trashRoot = normalized(trashPath);
        final List<Path> found = new ArrayList<>();
        Files.walkFileTree(libraryPath, new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) {
                return normalized(dir).equals(trashRoot)
                        ? FileVisitResult.SKIP_SUBTREE
                        : FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
                if (isAudioFile(file)) {
                    found.add(file);
                }
                return FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult visitFileFailed(Path file, IOException exc) {
                // The default implementation rethrows, which would abort the task.
                log.warn("访问路径失败,已跳过: {}", file, exc);
                return FileVisitResult.CONTINUE;
            }
        });
        return found;
    }

    /**
     * Resolves each group: keeps the best-scoring file and sends the rest to the trash.
     * Files listed in {@code handled} are ignored, and groups left with fewer than two
     * files are skipped.
     */
    private void processGroups(Collection<List<Path>> groups,
                               Set<Path> handled,
                               Path trashPath,
                               boolean move,
                               String taskId,
                               int total,
                               Stats stats) {
        for (List<Path> group : groups) {
            List<Path> remaining = new ArrayList<>(group.size());
            for (Path candidate : group) {
                if (!handled.contains(candidate)) {
                    remaining.add(candidate);
                }
            }
            if (remaining.size() <= 1) {
                continue;
            }
            stats.duplicateGroups++;
            Path keep = chooseBestFileByScore(remaining);
            remaining.remove(keep);
            handleDuplicates(remaining, keep, handled, trashPath, move, taskId, total, stats);
        }
    }

    /** Returns the lower-case extension without the dot, or {@code ""} if there is none. */
    private static String extensionOf(String lowerCaseName) {
        int dot = lowerCaseName.lastIndexOf('.');
        // A leading dot (hidden file) or a trailing dot does not count as an extension.
        if (dot <= 0 || dot == lowerCaseName.length() - 1) {
            return "";
        }
        return lowerCaseName.substring(dot + 1);
    }

    /** Returns whether the file name ends with one of the known audio extensions. */
    private boolean isAudioFile(Path file) {
        Path fileName = file.getFileName();
        if (fileName == null) {
            return false;
        }
        // Locale.ROOT: the default-locale mapping can turn 'I' into a dotless 'ı'
        // (Turkish), which would make "AIFF" miss the extension set.
        String ext = extensionOf(fileName.toString().toLowerCase(Locale.ROOT));
        return !ext.isEmpty() && AUDIO_EXTENSIONS.contains(ext);
    }

    /**
     * Computes the MD5 digest of the file content.
     *
     * @return the digest as a 32-character lower-case hexadecimal string
     */
    private String calculateMd5(Path file) throws IOException, NoSuchAlgorithmException {
        MessageDigest md = MessageDigest.getInstance("MD5");
        try (InputStream is = Files.newInputStream(file);
             DigestInputStream dis = new DigestInputStream(is, md)) {
            byte[] buffer = new byte[8192];
            // Reading through the stream feeds every byte into the digest.
            while (dis.read(buffer) != -1) {
                // no-op
            }
        }
        byte[] digest = md.digest();
        StringBuilder sb = new StringBuilder(digest.length * 2);
        for (byte b : digest) {
            sb.append(HEX_DIGITS[(b >> 4) & 0x0F]).append(HEX_DIGITS[b & 0x0F]);
        }
        return sb.toString();
    }

    /**
     * Grouping key for metadata matching: artist + title + album + bucketed duration.
     */
    private static final class MetadataKey {
        private final String artist;
        private final String title;
        private final String album;
        private final int normalizedDuration;

        private MetadataKey(String artist, String title, String album, int normalizedDuration) {
            this.artist = artist;
            this.title = title;
            this.album = album;
            this.normalizedDuration = normalizedDuration;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (!(o instanceof MetadataKey)) return false;
            MetadataKey that = (MetadataKey) o;
            return normalizedDuration == that.normalizedDuration &&
                    Objects.equals(artist, that.artist) &&
                    Objects.equals(title, that.title) &&
                    Objects.equals(album, that.album);
        }

        @Override
        public int hashCode() {
            return Objects.hash(artist, title, album, normalizedDuration);
        }
    }

    /**
     * Reads the tags of an audio file and builds its metadata grouping key.
     *
     * <p>The track length is bucketed by integer division with
     * {@link #DURATION_TOLERANCE_SECONDS}. Lengths on opposite sides of a bucket
     * boundary (for example 4 s and 5 s) therefore produce different keys even though
     * they differ by less than the tolerance.
     *
     * @return the key, or empty if the file has no tag, lacks artist or title, or
     *         cannot be read
     */
    private Optional<MetadataKey> readMetadataKey(Path file) {
        try {
            AudioFile audioFile = AudioFileIO.read(file.toFile());
            Tag tag = audioFile.getTag();
            if (tag == null) {
                return Optional.empty();
            }
            String artist = normalize(tag.getFirst(FieldKey.ARTIST));
            String title = normalize(tag.getFirst(FieldKey.TITLE));
            String album = normalize(tag.getFirst(FieldKey.ALBUM));
            int lengthSec = audioFile.getAudioHeader().getTrackLength();
            if (artist.isEmpty() || title.isEmpty()) {
                // Without the core tags the file takes no part in metadata grouping.
                return Optional.empty();
            }
            int normalizedDuration = lengthSec / DURATION_TOLERANCE_SECONDS;
            return Optional.of(new MetadataKey(artist, title, album, normalizedDuration));
        } catch (Exception e) {
            // Corrupt tags or an unsupported format: skip metadata matching for this file.
            log.debug("读取元数据失败: {}", file, e);
            return Optional.empty();
        }
    }

    /** Trims and lower-cases a tag value (locale-independent); {@code null} becomes {@code ""}. */
    private String normalize(String s) {
        if (s == null) {
            return "";
        }
        return s.trim().toLowerCase(Locale.ROOT);
    }

    /**
     * Picks the file to keep from a non-empty group of candidates.
     *
     * <p>Each candidate is scored exactly once with {@link #scoreFile(Path)}. On equal
     * scores the candidate that appears first in the list wins.
     */
    private Path chooseBestFileByScore(List<Path> candidates) {
        Path best = candidates.get(0);
        double bestScore = scoreFile(best);
        for (int i = 1; i < candidates.size(); i++) {
            Path candidate = candidates.get(i);
            double score = scoreFile(candidate);
            if (score > bestScore) {
                best = candidate;
                bestScore = score;
            }
        }
        return best;
    }

    /**
     * Scores a candidate file; a higher score means it is preferred as the kept copy.
     *
     * <ul>
     *   <li>Format: FLAC +100, other lossless +80, everything else +50.</li>
     *   <li>Size: below 128 KB -30, otherwise +size/100 KB capped at +40.</li>
     *   <li>Name contains "sample", "preview" or "demo": -20.</li>
     *   <li>Name contains the word "live", "remix" or "karaoke": -5.</li>
     * </ul>
     */
    private double scoreFile(Path file) {
        double score = 0.0;
        String name = file.getFileName().toString().toLowerCase(Locale.ROOT);
        String ext = extensionOf(name);

        // Format weight.
        if ("flac".equals(ext)) {
            score += 100;
        } else if (OTHER_LOSSLESS_EXTENSIONS.contains(ext)) {
            score += 80;
        } else {
            score += 50; // lossy formats
        }

        // Size weight: larger usually means better quality, but the bonus is capped.
        try {
            double sizeKB = Files.size(file) / 1024.0;
            if (sizeKB < 128) {
                score -= 30; // tiny file, likely a sample or damaged
            } else {
                score += Math.min(sizeKB / 100.0, 40.0);
            }
        } catch (IOException e) {
            // Best effort: without a readable size the file gets no size adjustment.
            log.debug("获取文件大小失败: {}", file, e);
        }

        // File-name noise penalties.
        if (name.contains("sample") || name.contains("preview") || name.contains("demo")) {
            score -= 20;
        }
        if (name.matches(".*\\b(live|remix|karaoke)\\b.*")) {
            // Alternative versions are slightly less preferred.
            score -= 5;
        }
        // TODO: optionally weight by the bit rate read from the audio header.
        return score;
    }

    /**
     * Moves or copies the duplicates into the trash and publishes progress per file.
     *
     * <p>Each duplicate is added to {@code handled} before the attempt, so a later
     * phase never processes it again. {@code stats.moved} and {@code stats.failed} are
     * updated in place, so the published count is cumulative across groups.
     */
    private void handleDuplicates(List<Path> duplicates,
                                  Path keep,
                                  Set<Path> handled,
                                  Path trashPath,
                                  boolean move,
                                  String taskId,
                                  int total,
                                  Stats stats) {
        for (Path dup : duplicates) {
            handled.add(dup);
            try {
                Path target = resolveTargetFile(trashPath, dup.getFileName().toString());
                // No REPLACE_EXISTING: the target name was chosen to be free, and a file
                // that appears there in the meantime must not be overwritten.
                if (move) {
                    Files.move(dup, target);
                } else {
                    Files.copy(dup, target);
                }
                stats.moved++;
                sendProgress(taskId, total, stats.scanned,
                        stats.duplicateGroups, stats.moved,
                        String.format("重复文件: %s (保留: %s)",
                                dup.getFileName(), keep.getFileName()),
                        false);
            } catch (Exception e) {
                stats.failed++;
                log.warn("处理重复文件失败: {}", dup, e);
            }
        }
    }

    /**
     * Returns a path inside {@code targetDir} that does not exist yet. If
     * {@code fileName} is taken, " (1)", " (2)", ... is inserted before the extension.
     */
    private Path resolveTargetFile(Path targetDir, String fileName) {
        Path target = targetDir.resolve(fileName);
        if (!Files.exists(target)) {
            return target;
        }
        int lastDot = fileName.lastIndexOf('.');
        String base = lastDot > 0 ? fileName.substring(0, lastDot) : fileName;
        String ext = lastDot > 0 ? fileName.substring(lastDot) : "";
        for (int n = 1; Files.exists(target); n++) {
            target = targetDir.resolve(base + " (" + n + ")" + ext);
        }
        return target;
    }

    /**
     * Stores a progress message and publishes it on {@code /topic/progress/{taskId}}.
     *
     * <p>Field meaning for messages of type {@code "dedup"}:
     * <ul>
     *   <li>{@code total} - number of audio files found</li>
     *   <li>{@code processed} - number of files scanned so far</li>
     *   <li>{@code success} - number of duplicate groups</li>
     *   <li>{@code failed} - number of duplicates moved/copied (not a failure count)</li>
     * </ul>
     * These meanings differ from other task types; the front end is expected to map
     * them based on the message type.
     */
    private void sendProgress(String taskId,
                              int total,
                              int processed,
                              int success,
                              int failed,
                              String message,
                              boolean completed) {
        ProgressMessage pm = new ProgressMessage();
        pm.setTaskId(taskId);
        pm.setType("dedup");
        pm.setTotal(total);
        pm.setProcessed(processed);
        pm.setSuccess(success);
        pm.setFailed(failed);
        pm.setCurrentFile(null);
        pm.setMessage(message);
        pm.setCompleted(completed);
        progressStore.put(pm);
        messagingTemplate.convertAndSend("/topic/progress/" + taskId, pm);
    }
}