提交代码
This commit is contained in:
454
backend/src/main/java/com/music/service/DedupService.java
Normal file
454
backend/src/main/java/com/music/service/DedupService.java
Normal file
@@ -0,0 +1,454 @@
|
||||
package com.music.service;
|
||||
|
||||
import com.music.dto.ProgressMessage;
|
||||
import org.jaudiotagger.audio.AudioFile;
|
||||
import org.jaudiotagger.audio.AudioFileIO;
|
||||
import org.jaudiotagger.tag.FieldKey;
|
||||
import org.jaudiotagger.tag.Tag;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.messaging.simp.SimpMessagingTemplate;
|
||||
import org.springframework.scheduling.annotation.Async;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.*;
|
||||
import java.nio.file.attribute.BasicFileAttributes;
|
||||
import java.security.DigestInputStream;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
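/**
 * Music library de-duplication service.
 *
 * <p>Two strategies are implemented and can be enabled independently:
 * <ul>
 *   <li>MD5 matching: binary-level comparison that identifies byte-identical file copies.</li>
 *   <li>Metadata matching: groups files by artist, title, album and bucketed track length.</li>
 * </ul>
 * Within each duplicate group the file with the highest score (format, size, file-name noise)
 * is kept; bitrate-based scoring is not implemented yet.
 */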
|
||||
@Service
|
||||
public class DedupService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(DedupService.class);
|
||||
|
||||
private static final Set<String> AUDIO_EXTENSIONS = new HashSet<>(Arrays.asList(
|
||||
"mp3", "flac", "wav", "m4a", "aac", "ogg", "wma", "ape", "aiff", "aif", "wv", "tta", "opus"
|
||||
));
|
||||
|
||||
/** 元数据匹配允许的时长误差(秒) */
|
||||
private static final int DURATION_TOLERANCE_SECONDS = 5;
|
||||
|
||||
private final SimpMessagingTemplate messagingTemplate;
|
||||
private final ProgressStore progressStore;
|
||||
|
||||
/**
 * Creates the service with its two collaborators.
 *
 * @param messagingTemplate template used to publish progress messages
 * @param progressStore     store that receives each progress message
 */
public DedupService(SimpMessagingTemplate messagingTemplate,
                    ProgressStore progressStore) {
    this.progressStore = progressStore;
    this.messagingTemplate = messagingTemplate;
}
|
||||
|
||||
/**
|
||||
* 异步执行去重任务
|
||||
*/
|
||||
@Async
|
||||
public void dedup(String taskId,
|
||||
String libraryDir,
|
||||
String trashDir,
|
||||
boolean useMd5,
|
||||
boolean useMetadata,
|
||||
String mode) {
|
||||
|
||||
Path libraryPath = Paths.get(libraryDir);
|
||||
Path trashPath = Paths.get(trashDir);
|
||||
|
||||
try {
|
||||
// 基本校验
|
||||
if (!Files.exists(libraryPath) || !Files.isDirectory(libraryPath)) {
|
||||
sendProgress(taskId, 0, 0, 0, 0,
|
||||
"音乐库目录不存在或不是目录", true);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!Files.exists(trashPath)) {
|
||||
Files.createDirectories(trashPath);
|
||||
}
|
||||
|
||||
if (!"copy".equalsIgnoreCase(mode) && !"move".equalsIgnoreCase(mode)) {
|
||||
sendProgress(taskId, 0, 0, 0, 0,
|
||||
"执行模式错误,必须是 copy 或 move", true);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!useMd5 && !useMetadata) {
|
||||
sendProgress(taskId, 0, 0, 0, 0,
|
||||
"至少需要启用一种去重策略(MD5 或元数据匹配)", true);
|
||||
return;
|
||||
}
|
||||
|
||||
// 收集所有音频文件
|
||||
List<Path> audioFiles = new ArrayList<>();
|
||||
Files.walkFileTree(libraryPath, new SimpleFileVisitor<Path>() {
|
||||
@Override
|
||||
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
|
||||
if (isAudioFile(file)) {
|
||||
audioFiles.add(file);
|
||||
}
|
||||
return FileVisitResult.CONTINUE;
|
||||
}
|
||||
});
|
||||
|
||||
int total = audioFiles.size();
|
||||
if (total == 0) {
|
||||
sendProgress(taskId, 0, 0, 0, 0,
|
||||
"未在音乐库中找到音频文件", true);
|
||||
return;
|
||||
}
|
||||
|
||||
AtomicInteger scanned = new AtomicInteger(0);
|
||||
AtomicInteger duplicateGroups = new AtomicInteger(0);
|
||||
AtomicInteger moved = new AtomicInteger(0);
|
||||
AtomicInteger failed = new AtomicInteger(0);
|
||||
|
||||
sendProgress(taskId, total, 0, 0, 0,
|
||||
"开始扫描音乐库...", false);
|
||||
|
||||
Map<String, List<Path>> md5Groups = new HashMap<>();
|
||||
Map<MetadataKey, List<Path>> metadataGroups = new HashMap<>();
|
||||
|
||||
// 第一阶段:扫描并根据配置构建分组
|
||||
for (Path file : audioFiles) {
|
||||
try {
|
||||
if (useMd5) {
|
||||
String md5 = calculateMd5(file);
|
||||
md5Groups.computeIfAbsent(md5, k -> new ArrayList<>()).add(file);
|
||||
}
|
||||
|
||||
if (useMetadata) {
|
||||
Optional<MetadataKey> keyOpt = readMetadataKey(file);
|
||||
keyOpt.ifPresent(key -> metadataGroups
|
||||
.computeIfAbsent(key, k -> new ArrayList<>())
|
||||
.add(file));
|
||||
}
|
||||
|
||||
int currentScanned = scanned.incrementAndGet();
|
||||
if (currentScanned % 50 == 0) {
|
||||
sendProgress(taskId, total, currentScanned,
|
||||
duplicateGroups.get(), moved.get(),
|
||||
String.format("扫描中(%d/%d)", currentScanned, total),
|
||||
false);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
failed.incrementAndGet();
|
||||
log.warn("扫描文件失败: {}", file, e);
|
||||
}
|
||||
}
|
||||
|
||||
// 第二阶段:处理 MD5 去重结果(完全二进制重复)
|
||||
if (useMd5) {
|
||||
for (Map.Entry<String, List<Path>> entry : md5Groups.entrySet()) {
|
||||
List<Path> group = entry.getValue();
|
||||
if (group.size() <= 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
duplicateGroups.incrementAndGet();
|
||||
|
||||
Path keep = chooseBestFileByScore(group);
|
||||
List<Path> duplicates = new ArrayList<>(group);
|
||||
duplicates.remove(keep);
|
||||
|
||||
moved.addAndGet(handleDuplicates(duplicates, keep, trashPath, mode, taskId, total,
|
||||
scanned, duplicateGroups, failed));
|
||||
}
|
||||
}
|
||||
|
||||
if (useMetadata) {
|
||||
// 第三阶段:处理元数据匹配去重结果
|
||||
for (Map.Entry<MetadataKey, List<Path>> entry : metadataGroups.entrySet()) {
|
||||
List<Path> group = entry.getValue();
|
||||
if (group.size() <= 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
duplicateGroups.incrementAndGet();
|
||||
|
||||
Path keep = chooseBestFileByScore(group);
|
||||
List<Path> duplicates = new ArrayList<>(group);
|
||||
duplicates.remove(keep);
|
||||
|
||||
moved.addAndGet(handleDuplicates(duplicates, keep, trashPath, mode, taskId, total,
|
||||
scanned, duplicateGroups, failed));
|
||||
}
|
||||
}
|
||||
|
||||
sendProgress(taskId, total, scanned.get(),
|
||||
duplicateGroups.get(), moved.get(),
|
||||
String.format("任务完成!扫描文件: %d, 重复组: %d, 移动/复制文件: %d",
|
||||
scanned.get(), duplicateGroups.get(), moved.get()),
|
||||
true);
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("去重任务执行失败", e);
|
||||
sendProgress(taskId, 0, 0, 0, 0,
|
||||
"任务执行失败: " + e.getMessage(), true);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isAudioFile(Path file) {
|
||||
String name = file.getFileName().toString().toLowerCase();
|
||||
int idx = name.lastIndexOf('.');
|
||||
if (idx <= 0 || idx == name.length() - 1) {
|
||||
return false;
|
||||
}
|
||||
String ext = name.substring(idx + 1);
|
||||
return AUDIO_EXTENSIONS.contains(ext);
|
||||
}
|
||||
|
||||
private String calculateMd5(Path file) throws IOException, NoSuchAlgorithmException {
|
||||
MessageDigest md = MessageDigest.getInstance("MD5");
|
||||
try (InputStream is = Files.newInputStream(file);
|
||||
DigestInputStream dis = new DigestInputStream(is, md)) {
|
||||
byte[] buffer = new byte[8192];
|
||||
// 读取整个文件,结果自动更新到 md 中
|
||||
while (dis.read(buffer) != -1) {
|
||||
// no-op
|
||||
}
|
||||
}
|
||||
byte[] digest = md.digest();
|
||||
StringBuilder sb = new StringBuilder(digest.length * 2);
|
||||
for (byte b : digest) {
|
||||
sb.append(String.format("%02x", b));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* 元数据分组键:艺术家 + 标题 + 专辑 + 时长(按 5 秒误差归一)
|
||||
*/
|
||||
private static class MetadataKey {
|
||||
private final String artist;
|
||||
private final String title;
|
||||
private final String album;
|
||||
private final int normalizedDuration;
|
||||
|
||||
private MetadataKey(String artist, String title, String album, int normalizedDuration) {
|
||||
this.artist = artist;
|
||||
this.title = title;
|
||||
this.album = album;
|
||||
this.normalizedDuration = normalizedDuration;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof MetadataKey)) return false;
|
||||
MetadataKey that = (MetadataKey) o;
|
||||
return normalizedDuration == that.normalizedDuration &&
|
||||
Objects.equals(artist, that.artist) &&
|
||||
Objects.equals(title, that.title) &&
|
||||
Objects.equals(album, that.album);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(artist, title, album, normalizedDuration);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 从音频文件读取用于匹配的元数据键
|
||||
*/
|
||||
private Optional<MetadataKey> readMetadataKey(Path file) {
|
||||
try {
|
||||
AudioFile audioFile = AudioFileIO.read(file.toFile());
|
||||
Tag tag = audioFile.getTag();
|
||||
if (tag == null) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
String artist = normalize(tag.getFirst(FieldKey.ARTIST));
|
||||
String title = normalize(tag.getFirst(FieldKey.TITLE));
|
||||
String album = normalize(tag.getFirst(FieldKey.ALBUM));
|
||||
int lengthSec = audioFile.getAudioHeader().getTrackLength();
|
||||
|
||||
if (artist.isEmpty() || title.isEmpty()) {
|
||||
// 核心标签缺失则跳过元数据分组
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
// 将时长按 5 秒误差容忍度归一化
|
||||
int normalizedDuration = lengthSec / DURATION_TOLERANCE_SECONDS;
|
||||
|
||||
return Optional.of(new MetadataKey(artist, title, album, normalizedDuration));
|
||||
} catch (Exception e) {
|
||||
// 标签损坏或不支持的格式时,忽略元数据去重
|
||||
log.debug("读取元数据失败: {}", file, e);
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
private String normalize(String s) {
|
||||
if (s == null) {
|
||||
return "";
|
||||
}
|
||||
return s.trim().toLowerCase();
|
||||
}
|
||||
|
||||
/**
|
||||
* 对一组候选文件进行综合评分,选择最佳保留文件
|
||||
*
|
||||
* 评分策略:
|
||||
* - 格式优先:FLAC > 其他无损 > 有损
|
||||
* - 码率优先:高码率得分更高(如果可获取)
|
||||
* - 文件大小:极小文件减分
|
||||
* - 文件名噪声惩罚:含样本/preview 等噪声词减分
|
||||
*/
|
||||
private Path chooseBestFileByScore(List<Path> candidates) {
|
||||
if (candidates.size() == 1) {
|
||||
return candidates.get(0);
|
||||
}
|
||||
|
||||
return candidates.stream()
|
||||
.max(Comparator.comparingDouble(this::scoreFile))
|
||||
.orElse(candidates.get(0));
|
||||
}
|
||||
|
||||
private double scoreFile(Path file) {
|
||||
double score = 0.0;
|
||||
|
||||
String name = file.getFileName().toString().toLowerCase();
|
||||
String ext = "";
|
||||
int idx = name.lastIndexOf('.');
|
||||
if (idx > 0 && idx < name.length() - 1) {
|
||||
ext = name.substring(idx + 1);
|
||||
}
|
||||
|
||||
// 格式权重
|
||||
if ("flac".equals(ext)) {
|
||||
score += 100;
|
||||
} else if (Arrays.asList("wav", "ape", "aiff", "aif", "wv", "tta").contains(ext)) {
|
||||
score += 80;
|
||||
} else {
|
||||
score += 50; // 有损格式
|
||||
}
|
||||
|
||||
// 文件大小(KB)加权:更大的通常音质更好,但极大文件不再线性加分
|
||||
try {
|
||||
long size = Files.size(file);
|
||||
double sizeKB = size / 1024.0;
|
||||
if (sizeKB < 128) {
|
||||
score -= 30; // 极小文件,疑似样本/损坏
|
||||
} else {
|
||||
score += Math.min(sizeKB / 100.0, 40.0);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// 忽略大小获取失败
|
||||
}
|
||||
|
||||
// 文件名噪声惩罚
|
||||
if (name.contains("sample") || name.contains("preview") || name.contains("demo")) {
|
||||
score -= 20;
|
||||
}
|
||||
if (name.matches(".*\\b(live|remix|karaoke)\\b.*")) {
|
||||
// 某些版本可能不是首选,略微扣分(具体偏好可根据需要调整)
|
||||
score -= 5;
|
||||
}
|
||||
|
||||
// TODO:如有需要,可从音频头中读取比特率,进一步加权
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
/**
|
||||
* 将重复文件移动/复制到回收站,并更新统计与进度
|
||||
*
|
||||
* @return 实际成功移动/复制的文件数量
|
||||
*/
|
||||
private int handleDuplicates(List<Path> duplicates,
|
||||
Path keep,
|
||||
Path trashPath,
|
||||
String mode,
|
||||
String taskId,
|
||||
int total,
|
||||
AtomicInteger scanned,
|
||||
AtomicInteger duplicateGroups,
|
||||
AtomicInteger failed) {
|
||||
int movedCount = 0;
|
||||
for (Path dup : duplicates) {
|
||||
try {
|
||||
Path target = resolveTargetFile(trashPath, dup.getFileName().toString());
|
||||
if ("move".equalsIgnoreCase(mode)) {
|
||||
Files.move(dup, target, StandardCopyOption.REPLACE_EXISTING);
|
||||
} else {
|
||||
Files.copy(dup, target, StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
movedCount++;
|
||||
sendProgress(taskId, total, scanned.get(),
|
||||
duplicateGroups.get(), movedCount,
|
||||
String.format("重复文件: %s (保留: %s)",
|
||||
dup.getFileName(), keep.getFileName()),
|
||||
false);
|
||||
} catch (Exception e) {
|
||||
failed.incrementAndGet();
|
||||
log.warn("处理重复文件失败: {}", dup, e);
|
||||
}
|
||||
}
|
||||
return movedCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析回收站中的目标文件名,处理重名冲突
|
||||
*/
|
||||
private Path resolveTargetFile(Path targetDir, String fileName) throws IOException {
|
||||
Path target = targetDir.resolve(fileName);
|
||||
if (!Files.exists(target)) {
|
||||
return target;
|
||||
}
|
||||
|
||||
int lastDot = fileName.lastIndexOf('.');
|
||||
String base = lastDot > 0 ? fileName.substring(0, lastDot) : fileName;
|
||||
String ext = lastDot > 0 ? fileName.substring(lastDot) : "";
|
||||
int n = 1;
|
||||
while (Files.exists(target)) {
|
||||
String next = base + " (" + n + ")" + ext;
|
||||
target = targetDir.resolve(next);
|
||||
n++;
|
||||
}
|
||||
return target;
|
||||
}
|
||||
|
||||
/**
|
||||
* 发送进度消息
|
||||
*
|
||||
* 字段语义(供前端展示用):
|
||||
* - total:扫描到的音频文件总数
|
||||
* - processed:已扫描文件数
|
||||
* - success:重复组数量
|
||||
* - failed:移动/复制的重复文件数量
|
||||
*
|
||||
* 由于进度字段在不同任务中的含义略有差异,前端可根据 type === "dedup" 做专门映射。
|
||||
*/
|
||||
private void sendProgress(String taskId,
|
||||
int total,
|
||||
int processed,
|
||||
int success,
|
||||
int failed,
|
||||
String message,
|
||||
boolean completed) {
|
||||
ProgressMessage pm = new ProgressMessage();
|
||||
pm.setTaskId(taskId);
|
||||
pm.setType("dedup");
|
||||
pm.setTotal(total);
|
||||
pm.setProcessed(processed);
|
||||
pm.setSuccess(success);
|
||||
pm.setFailed(failed);
|
||||
pm.setCurrentFile(null);
|
||||
pm.setMessage(message);
|
||||
pm.setCompleted(completed);
|
||||
|
||||
progressStore.put(pm);
|
||||
messagingTemplate.convertAndSend("/topic/progress/" + taskId, pm);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user