feat(flyer-import): integrate AI-based flyer parsing with image support
- Add support for PNG, JPEG, and WebP image formats in flyer import
- Replace external importer service with internal AI-based parsing pipeline
- Add new services: TextExtractorService, AiFlyerParserService, FlyerNormalizerService
- Integrate Mistral AI, pdf-parse, and tesseract.js dependencies
- Add quality confidence indicators and warning panels in Flutter UI
- Update package.json with new dependencies and transform ignore patterns
- Add documentation for flyer importer system
- Add Kilo AI planning file for Happy Island project

BREAKING CHANGE: Flyer import now uses internal AI parsing instead of external importer service
This commit is contained in:
@@ -0,0 +1,100 @@
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import * as fs from 'fs';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
import * as pdf from 'pdf-parse';
|
||||
import Tesseract from 'tesseract.js';
|
||||
|
||||
/**
 * Extracts plain text from an uploaded flyer file.
 *
 * Strategy: plain-text uploads are decoded directly, images go straight to
 * OCR, and everything else is first run through pdf-parse with OCR as the
 * fallback when pdf-parse fails or yields too little text.
 *
 * NOTE(review): this file imports pdf-parse as a namespace (`import * as pdf`)
 * and tesseract.js as a default import. For CommonJS packages only one of
 * those two styles is usable under a given `esModuleInterop` setting —
 * confirm against the project's tsconfig.
 */
@Injectable()
export class TextExtractorService {
  /** Minimum number of words pdf-parse must yield for its output to be trusted. */
  private static readonly MIN_PDF_WORD_COUNT = 10;

  /** Known MIME types and the file extension used for the temporary OCR input. */
  private static readonly MIME_EXTENSIONS: ReadonlyMap<string, string> = new Map([
    ['image/png', '.png'],
    ['image/webp', '.webp'],
    ['image/jpeg', '.jpg'],
    ['text/plain', '.txt'],
    ['application/pdf', '.pdf'],
  ]);

  private readonly logger = new Logger(TextExtractorService.name);

  /**
   * Extracts text from a flyer buffer.
   *
   * @param buffer Raw file contents (PDF, image or plain text).
   * @param mimeType MIME type reported for the upload, if known.
   * @param originalFilename Original file name; used only to pick an
   *   extension for the temporary OCR file when the MIME type is unknown.
   * @returns The extracted text (trimmed for the pdf-parse and plain-text paths).
   * @throws Error when OCR is attempted and fails.
   */
  async extractText(
    buffer: Buffer,
    mimeType?: string,
    originalFilename?: string,
  ): Promise<string> {
    // Plain text can never succeed through pdf-parse or OCR; decode it directly.
    if (mimeType === 'text/plain') {
      this.logger.debug('Plain-text input, skipping pdf-parse and OCR');
      return buffer.toString('utf8').trim();
    }

    // Images are never valid PDFs, so skip the doomed parse and its warning.
    if (mimeType?.startsWith('image/')) {
      this.logger.debug('Image input, skipping pdf-parse');
      return this.extractTextViaOCR(buffer, mimeType, originalFilename);
    }

    // Primary path: text layer embedded in the PDF.
    try {
      this.logger.debug('Attempting pdf-parse extraction');
      const pdfData = await pdf(buffer);

      const text = (pdfData.text ?? '').trim();
      const wordCount = this.countWords(text);

      this.logger.debug(`pdf-parse extracted ${wordCount} words`);

      // Enough text means the PDF has a usable text layer.
      if (wordCount >= TextExtractorService.MIN_PDF_WORD_COUNT) {
        return text;
      }

      this.logger.debug('pdf-parse gave too little text, falling back to OCR');
    } catch (err: unknown) {
      this.logger.warn(`pdf-parse failed: ${String(err)}`);
    }

    // Fallback: OCR with Tesseract.
    return this.extractTextViaOCR(buffer, mimeType, originalFilename);
  }

  /**
   * Extracts text via Tesseract OCR using the Swedish (`swe`) language model.
   *
   * The buffer is written to a file inside a freshly created temporary
   * directory so concurrent calls cannot clash on the file name; the
   * directory is removed again whether or not recognition succeeds.
   *
   * NOTE(review): tesseract.js is presumably limited to image inputs; a PDF
   * that reaches this fallback will likely end in the error below unless it
   * is rasterised first — confirm against the tesseract.js documentation.
   *
   * @param buffer File contents to recognise.
   * @param mimeType MIME type of the upload, if known.
   * @param originalFilename Original file name, if known.
   * @returns The recognised text (possibly empty).
   * @throws Error when writing the temporary file or the recognition fails.
   */
  private async extractTextViaOCR(
    buffer: Buffer,
    mimeType?: string,
    originalFilename?: string,
  ): Promise<string> {
    try {
      this.logger.debug('Starting Tesseract OCR extraction');

      const ext = this.resolveTempExtension(mimeType, originalFilename);
      // mkdtemp appends random characters, giving every call a private directory.
      const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'ocr-'));
      const tempPath = path.join(tempDir, `input${ext}`);

      try {
        await fs.promises.writeFile(tempPath, buffer);

        const result = await Tesseract.recognize(tempPath, 'swe', {
          logger: (m) => this.logger.debug(`Tesseract: ${m.status}`),
        });

        const text = result.data.text || '';
        this.logger.debug(`Tesseract extracted ${this.countWords(text)} words`);
        return text;
      } finally {
        try {
          await fs.promises.rm(tempDir, { recursive: true, force: true });
        } catch (cleanupErr: unknown) {
          // Best effort: a leftover temp directory must not fail the import.
          this.logger.debug(`OCR temp cleanup failed: ${String(cleanupErr)}`);
        }
      }
    } catch (err: unknown) {
      this.logger.error(`OCR extraction failed: ${String(err)}`);
      throw new Error('Kunde inte extrahera text från flyern (pdf-parse + OCR misslyckades).');
    }
  }

  /**
   * Picks the file extension for the temporary OCR input.
   *
   * A known MIME type wins; otherwise the lower-cased extension of the
   * original file name is used; `.pdf` is the final default.
   *
   * @param mimeType MIME type of the upload, if known.
   * @param originalFilename Original file name, if known.
   * @returns Extension including the leading dot.
   */
  private resolveTempExtension(mimeType?: string, originalFilename?: string): string {
    const fromMime =
      mimeType === undefined ? undefined : TextExtractorService.MIME_EXTENSIONS.get(mimeType);
    if (fromMime !== undefined) return fromMime;

    const originalExt = originalFilename ? path.extname(originalFilename).toLowerCase() : '';
    if (originalExt) return originalExt;

    return '.pdf';
  }

  /** Counts whitespace-separated, non-empty tokens; empty text counts as 0. */
  private countWords(text: string): number {
    return text.split(/\s+/).filter((word) => word.length > 0).length;
  }
}
|
||||
Reference in New Issue
Block a user