166 lines
5.3 KiB
TypeScript
166 lines
5.3 KiB
TypeScript
#!/usr/bin/env npx ts-node
|
||
/**
 * Local test script: parse a document and split it into chunks.
 * Usage: npx ts-node scripts/test-chunking-file.ts <file path>
 * Supported: .docx, .pdf, .epub
 */
|
||
|
||
import fs from 'fs';
|
||
import path from 'path';
|
||
import pdfParse from 'pdf-parse';
|
||
import mammoth from 'mammoth';
|
||
import EPub from 'epub';
|
||
import { structureChunkingService } from '../src/services/structureChunkingService';
|
||
|
||
/**
 * Reads every chapter listed in an EPUB's `flow` and joins the non-empty,
 * trimmed chapter texts with a blank line between them.
 *
 * Chapter extraction is best-effort: a chapter that cannot be read is skipped
 * with a warning on stderr instead of failing the whole book. Failures of the
 * EPUB parser itself (its `'error'` event), or anything unexpected while
 * collecting chapters, reject the returned promise.
 *
 * @param filePath Path of the `.epub` file.
 * @returns The concatenated chapter text as delivered by `epub.getChapter`.
 */
function readEpubText(filePath: string): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const epub = new EPub(filePath);

    // Wraps the callback-style getChapter; never rejects, so one bad chapter
    // cannot abort the loop below.
    const readChapter = (chapterId: string): Promise<string> =>
      new Promise<string>((done) => {
        const skip = (reason: unknown): void => {
          const detail = reason instanceof Error ? reason.message : String(reason);
          console.warn(`⚠️ 跳过无法读取的章节 ${chapterId}: ${detail}`);
          done('');
        };
        try {
          epub.getChapter(chapterId, (err: unknown, chapterText?: string) => {
            if (err) {
              skip(err);
            } else {
              done(chapterText ?? '');
            }
          });
        } catch (err: unknown) {
          // getChapter may also throw synchronously; treat it like a callback error.
          skip(err);
        }
      });

    // Chapters are read one at a time, in flow order, so the output keeps the
    // book's reading order.
    const collectChapters = async (): Promise<string> => {
      const chapters: string[] = [];
      for (const item of epub.flow ?? []) {
        const chapterText = (await readChapter(item.id)).trim();
        if (chapterText) chapters.push(chapterText);
      }
      return chapters.join('\n\n');
    };

    epub.on('error', reject);
    epub.on('end', () => {
      // Forward unexpected failures to the caller instead of leaving an
      // unhandled rejection and a promise that never settles.
      collectChapters().then(resolve, reject);
    });
    epub.parse();
  });
}

/**
 * Extracts the text of a document so it can be passed to the chunker.
 *
 * The format is selected from the file extension (case-insensitive):
 * - `.docx` — `mammoth.extractRawText`
 * - `.pdf`  — `pdf-parse`
 * - `.epub` — all readable chapters, joined by blank lines
 *
 * The extension is validated before any file access, and the file is only
 * loaded into memory for the formats that are parsed from a buffer.
 *
 * @param filePath Path of the document to read.
 * @returns The extracted text.
 * @throws Error if the extension is not supported, or if reading or parsing
 *   the file fails.
 */
async function parseFile(filePath: string): Promise<string> {
  const ext = path.extname(filePath).toLowerCase();

  switch (ext) {
    case '.docx': {
      const buffer = fs.readFileSync(filePath);
      const result = await mammoth.extractRawText({ buffer });
      return result.value;
    }
    case '.pdf': {
      const data = await pdfParse(fs.readFileSync(filePath));
      return data.text;
    }
    case '.epub':
      return readEpubText(filePath);
    default:
      throw new Error(`不支持的格式: ${ext}`);
  }
}
|
||
|
||
/** The chunk fields that the report in this script reads. */
type ReportChunk = { readonly title: string; readonly content: string };

/** Prints a section heading framed by two thin horizontal rules. */
function printSection(title: string): void {
  const rule = '─'.repeat(60);
  console.log('\n' + rule);
  console.log(title);
  console.log(rule);
}

/**
 * Builds a one-line preview: whitespace runs are collapsed to single spaces
 * and the text is cut to `maxLength` characters. The ellipsis is appended only
 * when something was actually cut off.
 */
function previewOf(content: string, maxLength = 60): string {
  const flattened = content.replace(/\s+/g, ' ');
  return flattened.length > maxLength
    ? `${flattened.substring(0, maxLength)}...`
    : flattened;
}

/**
 * Runs simple heuristics over the chunk list and returns one warning line per
 * suspicious finding (empty array when nothing stands out).
 *
 * @param chunks Chunks produced by the chunking service.
 * @param totalCharacters Total character count reported by the service.
 */
function detectChunkIssues(
  chunks: readonly ReportChunk[],
  totalCharacters: number,
): string[] {
  // Nothing to analyse; also avoids a 0/0 average below.
  if (chunks.length === 0) return [];

  const issues: string[] = [];

  // Unusual chunk counts.
  if (chunks.length > 50) {
    issues.push(`⚠️ 分块数量较多 (${chunks.length})，可能存在误匹配`);
  }
  if (chunks.length === 1 && totalCharacters > 5000) {
    issues.push(`⚠️ 只有1个分块但内容很长，可能未正确识别结构`);
  }

  // Chunk size spread: very short chunks, and chunks over 5x the average.
  const sizes = chunks.map((c) => c.content.length);
  const avgSize = sizes.reduce((sum, size) => sum + size, 0) / sizes.length;
  const tooSmall = sizes.filter((size) => size < 100).length;
  const tooLarge = sizes.filter((size) => size > avgSize * 5).length;

  if (tooSmall > 0) {
    issues.push(`⚠️ ${tooSmall} 个分块内容过短 (<100字符)，可能是误匹配`);
  }
  if (tooLarge > 0) {
    issues.push(`⚠️ ${tooLarge} 个分块内容过长，分块可能不均匀`);
  }

  // Suspiciously short titles.
  const shortTitleCount = chunks.filter((c) => c.title.length < 3).length;
  if (shortTitleCount > 0) {
    issues.push(`⚠️ ${shortTitleCount} 个分块标题过短`);
  }

  // Duplicate titles.
  const uniqueTitles = new Set(chunks.map((c) => c.title));
  if (uniqueTitles.size < chunks.length) {
    issues.push(`⚠️ 存在重复标题，可能是目录或列表被误匹配`);
  }

  return issues;
}

/**
 * CLI entry point: parses the document given as the first command-line
 * argument, runs it through `structureChunkingService.parseAsync`, and prints
 * a report (summary, per-chunk list, heuristic warnings, first 10 titles).
 *
 * Exits with status 1 when the argument is missing, the file does not exist,
 * or parsing/chunking throws.
 */
async function main(): Promise<void> {
  const filePath = process.argv[2];
  if (!filePath) {
    console.log('用法: npx ts-node scripts/test-chunking-file.ts <文件路径>');
    console.log('支持: .docx, .pdf, .epub');
    process.exit(1);
  }

  if (!fs.existsSync(filePath)) {
    console.error(`文件不存在: ${filePath}`);
    process.exit(1);
  }

  const banner = '═'.repeat(60);
  console.log(banner);
  console.log(`📄 文件: ${path.basename(filePath)}`);
  console.log(banner);

  try {
    // 1. Parse the document into plain text.
    console.log('\n⏳ 解析文档...');
    const text = await parseFile(filePath);
    console.log(`✅ 解析完成: ${text.length.toLocaleString()} 字符`);

    // 2. Chunk the text (LLM-enhanced variant).
    console.log('\n⏳ 执行分块（LLM 增强版）...');
    const result = await structureChunkingService.parseAsync(text);

    // 3. Summary.
    printSection('📊 分块结果');
    console.log(` 识别模式: ${result.pattern || '（无结构）'}`);
    console.log(` 分块数量: ${result.chunks.length}`);
    console.log(` 总字符数: ${result.totalCharacters.toLocaleString()}`);

    if (result.chunks.length > 0) {
      printSection('📋 分块列表');
      result.chunks.forEach((chunk: ReportChunk, index: number) => {
        console.log(`\n[${index + 1}] ${chunk.title}`);
        console.log(` 字符: ${chunk.content.length.toLocaleString()}`);
        console.log(` 预览: ${previewOf(chunk.content)}`);
      });

      // 4. Heuristic checks for likely mis-chunking.
      printSection('🔍 潜在问题检测');
      const issues = detectChunkIssues(result.chunks, result.totalCharacters);
      if (issues.length === 0) {
        console.log(' ✅ 未发现明显问题');
      } else {
        issues.forEach((issue) => console.log(` ${issue}`));
      }

      // 5. Full titles of the first 10 chunks.
      printSection('📝 前10个分块标题（完整）');
      result.chunks.slice(0, 10).forEach((chunk: ReportChunk, index: number) => {
        console.log(` ${index + 1}. ${chunk.title}`);
      });
      if (result.chunks.length > 10) {
        console.log(` ... 还有 ${result.chunks.length - 10} 个`);
      }
    }

    console.log('\n' + banner);
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances are guaranteed a message.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`❌ 处理失败: ${message}`);
    process.exit(1);
  }
}
|
||
|
||
// Run the script. main() reports expected failures itself; this handler only
// covers anything unexpected so the promise is never left floating.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});
|