001project_wildgrowth/backend/scripts/test-chunking-file.ts

166 lines
5.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env npx ts-node
/**
* 本地测试脚本:解析文档并分块
* 用法: npx ts-node scripts/test-chunking-file.ts <文件路径>
* 支持: .docx, .pdf, .epub
*/
import fs from 'fs';
import path from 'path';
import pdfParse from 'pdf-parse';
import mammoth from 'mammoth';
import EPub from 'epub';
import { structureChunkingService } from '../src/services/structureChunkingService';
/**
 * Extract the text of a document so it can be handed to the chunker.
 *
 * Dispatches on the lower-cased file extension:
 * - `.docx`: raw text via `mammoth.extractRawText`
 * - `.pdf`:  text via `pdf-parse`
 * - `.epub`: every non-empty chapter listed in `epub.flow`, trimmed and
 *            joined with blank lines
 *
 * @param filePath Path of the document on disk.
 * @returns The extracted text.
 * @throws Error if the extension is not supported. The check happens before
 *   any file access, so unsupported files are never read.
 */
async function parseFile(filePath: string): Promise<string> {
  const ext = path.extname(filePath).toLowerCase();

  switch (ext) {
    case '.docx': {
      const result = await mammoth.extractRawText({ buffer: fs.readFileSync(filePath) });
      return result.value;
    }

    case '.pdf': {
      const data = await pdfParse(fs.readFileSync(filePath));
      return data.text;
    }

    case '.epub':
      // EPub opens the file itself from the path, so no buffer is read here.
      return new Promise<string>((resolve, reject) => {
        const epub = new EPub(filePath);

        epub.on('end', async () => {
          // Anything thrown inside this async listener would otherwise become
          // an unhandled rejection and leave the outer promise pending forever.
          try {
            const chapters: string[] = [];
            for (const item of epub.flow || []) {
              // Best effort: a chapter that fails to load is treated as empty
              // instead of aborting the whole book.
              const text = await new Promise<string>((res) => {
                epub.getChapter(item.id, (err: unknown, t: string | undefined) =>
                  res(err ? '' : t || ''),
                );
              });
              const trimmed = text.trim();
              if (trimmed) chapters.push(trimmed);
            }
            resolve(chapters.join('\n\n'));
          } catch (err) {
            reject(err);
          }
        });
        epub.on('error', reject);
        epub.parse();
      });

    default:
      throw new Error(`不支持的格式: ${ext}`);
  }
}
/** Print a section title framed by two light horizontal rules, preceded by a blank line. */
function printSection(title: string): void {
  const rule = '─'.repeat(60);
  console.log('\n' + rule);
  console.log(title);
  console.log(rule);
}

/**
 * Run heuristic sanity checks over a chunking result.
 *
 * Flags: more than 50 chunks; a single chunk when the text exceeds 5000
 * characters; chunks shorter than 100 characters; chunks larger than five
 * times the average size; titles shorter than 3 characters; duplicate titles.
 *
 * @param chunks Chunks produced by the chunking service.
 * @param totalCharacters Total character count reported by the service.
 * @returns One human-readable warning per detected problem (empty if none).
 */
function detectChunkIssues(
  chunks: ReadonlyArray<{ title: string; content: string }>,
  totalCharacters: number,
): string[] {
  const issues: string[] = [];

  // Suspicious chunk counts.
  if (chunks.length > 50) {
    issues.push(`⚠️ 分块数量较多 (${chunks.length}),可能存在误匹配`);
  }
  if (chunks.length === 1 && totalCharacters > 5000) {
    issues.push(`⚠️ 只有1个分块但内容很长可能未正确识别结构`);
  }

  // Size outliers relative to fixed and average thresholds.
  const sizes = chunks.map((c) => c.content.length);
  const avgSize = sizes.reduce((a, b) => a + b, 0) / sizes.length;
  const tooSmall = sizes.filter((s) => s < 100).length;
  const tooLarge = sizes.filter((s) => s > avgSize * 5).length;
  if (tooSmall > 0) {
    issues.push(`⚠️ ${tooSmall} 个分块内容过短 (<100字符),可能是误匹配`);
  }
  if (tooLarge > 0) {
    issues.push(`⚠️ ${tooLarge} 个分块内容过长,分块可能不均匀`);
  }

  // Suspiciously short titles.
  const shortTitles = chunks.filter((c) => c.title.length < 3);
  if (shortTitles.length > 0) {
    issues.push(`⚠️ ${shortTitles.length} 个分块标题过短`);
  }

  // Duplicate titles.
  const titleSet = new Set(chunks.map((c) => c.title));
  if (titleSet.size < chunks.length) {
    issues.push(`⚠️ 存在重复标题,可能是目录或列表被误匹配`);
  }

  return issues;
}

/**
 * CLI entry point: parse the document given as the first command-line
 * argument, chunk it with `structureChunkingService.parseAsync`, and print a
 * report (summary, per-chunk preview, heuristic issue check, first 10 titles).
 *
 * Exits with status 1 when the argument is missing, the file does not exist,
 * or parsing/chunking fails.
 */
async function main(): Promise<void> {
  const filePath = process.argv[2];
  if (!filePath) {
    console.log('用法: npx ts-node scripts/test-chunking-file.ts <文件路径>');
    console.log('支持: .docx, .pdf, .epub');
    process.exit(1);
  }
  if (!fs.existsSync(filePath)) {
    console.error(`文件不存在: ${filePath}`);
    process.exit(1);
  }

  const banner = '═'.repeat(60);
  console.log(banner);
  console.log(`📄 文件: ${path.basename(filePath)}`);
  console.log(banner);

  try {
    // 1. Parse the document into plain text.
    console.log('\n⏳ 解析文档...');
    const text = await parseFile(filePath);
    console.log(`✅ 解析完成: ${text.length.toLocaleString()} 字符`);

    // 2. Chunk the text (LLM-enhanced variant).
    console.log('\n⏳ 执行分块(LLM 增强版)...');
    const result = await structureChunkingService.parseAsync(text);

    // 3. Summary.
    printSection('📊 分块结果');
    console.log(` 识别模式: ${result.pattern || '(无结构)'}`);
    console.log(` 分块数量: ${result.chunks.length}`);
    console.log(` 总字符数: ${result.totalCharacters.toLocaleString()}`);

    if (result.chunks.length > 0) {
      printSection('📋 分块列表');
      result.chunks.forEach((chunk, i) => {
        // Collapse whitespace so the 60-character preview stays on one line.
        const preview = chunk.content.replace(/\s+/g, ' ').substring(0, 60);
        console.log(`\n[${i + 1}] ${chunk.title}`);
        console.log(` 字符: ${chunk.content.length.toLocaleString()}`);
        console.log(` 预览: ${preview}...`);
      });

      // 4. Heuristic problem detection.
      printSection('🔍 潜在问题检测');
      const issues = detectChunkIssues(result.chunks, result.totalCharacters);
      if (issues.length === 0) {
        console.log(' ✅ 未发现明显问题');
      } else {
        issues.forEach((issue) => console.log(` ${issue}`));
      }

      // 5. Full titles of the first 10 chunks.
      printSection('📝 前10个分块标题完整');
      result.chunks.slice(0, 10).forEach((chunk, i) => {
        console.log(` ${i + 1}. ${chunk.title}`);
      });
      if (result.chunks.length > 10) {
        console.log(` ... 还有 ${result.chunks.length - 10}`);
      }
    }

    console.log('\n' + banner);
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`❌ 处理失败: ${message}`);
    process.exit(1);
  }
}
// Script entry point. The promise is intentionally not awaited; the handler
// makes sure anything that escapes main() is reported and yields a non-zero
// exit status instead of an unhandled rejection.
void main().catch((error: unknown) => {
  console.error(error instanceof Error ? error.message : String(error));
  process.exit(1);
});