187d0283a5
- Add support for PNG, JPEG, and WebP image formats in flyer import - Replace external importer service with internal AI-based parsing pipeline - Add new services: TextExtractorService, AiFlyerParserService, FlyerNormalizerService - Integrate Mistral AI, pdf-parse, and tesseract.js dependencies - Add quality confidence indicators and warning panels in Flutter UI - Update package.json with new dependencies and transform ignore patterns - Add documentation for flyer importer system - Add Kilo AI planning file for Happy Island project BREAKING CHANGE: Flyer import now uses internal AI parsing instead of external importer service
101 lines
3.2 KiB
TypeScript
101 lines
3.2 KiB
TypeScript
import { Injectable, Logger } from '@nestjs/common';
|
|
import * as fs from 'fs';
|
|
import * as os from 'os';
|
|
import * as path from 'path';
|
|
import * as pdf from 'pdf-parse';
|
|
import Tesseract from 'tesseract.js';
|
|
|
|
@Injectable()
export class TextExtractorService {
  /** Minimum number of words pdf-parse must yield for its output to be accepted without OCR. */
  private static readonly MIN_PDF_WORD_COUNT = 10;

  /** Temp-file extension for every MIME type that is recognized explicitly. */
  private static readonly EXTENSION_BY_MIME_TYPE: ReadonlyMap<string, string> = new Map([
    ['image/png', '.png'],
    ['image/webp', '.webp'],
    ['image/jpeg', '.jpg'],
    ['text/plain', '.txt'],
    ['application/pdf', '.pdf'],
  ]);

  private readonly logger = new Logger(TextExtractorService.name);

  /**
   * Extracts text from an uploaded flyer buffer.
   *
   * pdf-parse is tried first. If it throws, or yields fewer than
   * {@link TextExtractorService.MIN_PDF_WORD_COUNT} words, the buffer is
   * handed to the Tesseract OCR fallback instead.
   *
   * @param buffer Raw file contents.
   * @param mimeType Declared MIME type of the upload; only used to name the OCR temp file.
   * @param originalFilename Original upload name; its extension is the fallback when the
   *   MIME type is not recognized.
   * @returns The extracted text.
   * @throws Error when the OCR fallback is needed and fails.
   */
  async extractText(
    buffer: Buffer,
    mimeType?: string,
    originalFilename?: string,
  ): Promise<string> {
    // Primary attempt: read the embedded text layer with pdf-parse.
    try {
      this.logger.debug('Attempting pdf-parse extraction');
      const pdfData = await pdf(buffer);

      const text = pdfData.text?.trim() ?? '';
      const wordCount = this.countWords(text);
      this.logger.debug(`pdf-parse extracted ${wordCount} words`);

      // Enough text was found, so OCR is unnecessary.
      if (wordCount >= TextExtractorService.MIN_PDF_WORD_COUNT) {
        return text;
      }

      this.logger.debug('pdf-parse gave too little text, falling back to OCR');
    } catch (err: unknown) {
      // Non-PDF input (e.g. an image) ends up here; that is expected, so only warn.
      this.logger.warn(`pdf-parse failed: ${String(err)}`);
    }

    // Fallback: OCR with Tesseract.
    return this.extractTextViaOCR(buffer, mimeType, originalFilename);
  }

  /**
   * Extracts text from a file buffer via Tesseract OCR (Swedish language data).
   *
   * The buffer is written to a file inside a freshly created private temp
   * directory, recognized from that path, and the directory is removed
   * afterwards regardless of the outcome.
   *
   * NOTE(review): tesseract.js documents image inputs; a PDF buffer probably
   * needs rasterizing before OCR can succeed — confirm against the caller's
   * expectations.
   *
   * @param buffer Raw file contents.
   * @param mimeType Declared MIME type, used to pick the temp-file extension.
   * @param originalFilename Original upload name, used as an extension fallback.
   * @returns The recognized text (possibly empty).
   * @throws Error when the temp file cannot be written or recognition fails.
   */
  private async extractTextViaOCR(
    buffer: Buffer,
    mimeType?: string,
    originalFilename?: string,
  ): Promise<string> {
    try {
      this.logger.debug('Starting Tesseract OCR extraction');

      const ext = this.resolveTempExtension(mimeType, originalFilename);
      // A unique directory per call prevents concurrent requests from sharing
      // (and deleting) each other's temp file, which a timestamp-only name allowed.
      const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'ocr-'));

      try {
        const tempPath = path.join(tempDir, `input${ext}`);
        // Written inside the guarded region so a failed write is still cleaned up.
        await fs.promises.writeFile(tempPath, buffer);

        const result = await Tesseract.recognize(tempPath, 'swe', {
          logger: (m) => this.logger.debug(`Tesseract: ${m.status}`),
        });

        const text = result.data.text || '';
        this.logger.debug(`Tesseract extracted ${this.countWords(text)} words`);
        return text;
      } finally {
        // Best-effort cleanup: a failure here must not mask the OCR result or error.
        try {
          await fs.promises.rm(tempDir, { recursive: true, force: true });
        } catch (cleanupErr: unknown) {
          this.logger.warn(`Failed to remove OCR temp dir ${tempDir}: ${String(cleanupErr)}`);
        }
      }
    } catch (err: unknown) {
      this.logger.error(`OCR extraction failed: ${String(err)}`);
      throw new Error('Kunde inte extrahera text från flyern (pdf-parse + OCR misslyckades).');
    }
  }

  /**
   * Picks the extension for the OCR temp file.
   *
   * Order of precedence: a recognized MIME type (compared case-insensitively,
   * ignoring any `;parameter` suffix), then the extension of the original
   * filename if it is purely alphanumeric, then `.pdf`.
   *
   * @param mimeType Declared MIME type of the upload.
   * @param originalFilename Original upload name.
   * @returns An extension including the leading dot.
   */
  private resolveTempExtension(mimeType?: string, originalFilename?: string): string {
    const normalizedMime = mimeType?.split(';')[0]?.trim().toLowerCase();
    const knownExt =
      normalizedMime === undefined
        ? undefined
        : TextExtractorService.EXTENSION_BY_MIME_TYPE.get(normalizedMime);
    if (knownExt !== undefined) return knownExt;

    const originalExt = originalFilename ? path.extname(originalFilename).toLowerCase() : '';
    // The filename is client-supplied: only accept a plain alphanumeric extension.
    if (/^\.[a-z0-9]+$/.test(originalExt)) return originalExt;

    return '.pdf';
  }

  /** Counts whitespace-separated words; empty or whitespace-only text counts as 0. */
  private countWords(text: string): number {
    return text.split(/\s+/).filter((word) => word.length > 0).length;
  }
}
|