Files
recipe-app/backend/src/flyer-import/services/text-extractor.service.ts
T
Nils-Johan Gynther 187d0283a5
Test Suite / quick-import-pr-quick (push) Has been skipped
Test Suite / backend-full (push) Successful in 2m31s
Test Suite / flutter-quality (push) Failing after 3m48s
Test Suite / backend-pr-quick (push) Failing after 13m57s
feat(flyer-import): integrate AI-based flyer parsing with image support
- Add support for PNG, JPEG, and WebP image formats in flyer import
- Replace external importer service with internal AI-based parsing pipeline
- Add new services: TextExtractorService, AiFlyerParserService, FlyerNormalizerService
- Integrate Mistral AI, pdf-parse, and tesseract.js dependencies
- Add quality confidence indicators and warning panels in Flutter UI
- Update package.json with new dependencies and transform ignore patterns
- Add documentation for flyer importer system
- Add Kilo AI planning file for Happy Island project

BREAKING CHANGE: Flyer import now uses internal AI parsing instead of external importer service
2026-05-19 19:57:54 +02:00

101 lines
3.2 KiB
TypeScript

import { Injectable, Logger } from '@nestjs/common';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import * as pdf from 'pdf-parse';
import Tesseract from 'tesseract.js';
/**
 * Extracts plain text from uploaded flyer files.
 *
 * PDFs are first read with pdf-parse; when that yields too little text (or
 * fails), the file is handed to Tesseract OCR. Files declared as images skip
 * the pdf-parse attempt and go straight to OCR.
 */
@Injectable()
export class TextExtractorService {
  /** Minimum number of words pdf-parse must yield before its result is trusted. */
  private static readonly MIN_PDF_WORD_COUNT = 10;

  /** Shape of a file extension that is safe to embed in a temp file name. */
  private static readonly SAFE_EXTENSION = /^\.[a-z0-9]{1,10}$/;

  private readonly logger = new Logger(TextExtractorService.name);

  /**
   * Extracts text from a flyer file.
   *
   * Tries pdf-parse first unless `mimeType` declares an image; if pdf-parse
   * fails or returns fewer than {@link TextExtractorService.MIN_PDF_WORD_COUNT}
   * words, falls back to OCR.
   *
   * @param buffer File contents (PDF or image).
   * @param mimeType Declared MIME type of the upload, if known.
   * @param originalFilename Original upload file name, used only to pick a
   *   temp-file extension when `mimeType` is not recognised.
   * @returns The extracted text (trimmed when it comes from pdf-parse).
   * @throws Error when the OCR fallback fails as well.
   */
  async extractText(
    buffer: Buffer,
    mimeType?: string,
    originalFilename?: string,
  ): Promise<string> {
    // An image can never be parsed as a PDF; skipping the attempt avoids a
    // guaranteed failure and a misleading warning on every image import.
    const isImage = mimeType?.startsWith('image/') === true;

    if (!isImage) {
      try {
        this.logger.debug('Attempting pdf-parse extraction');
        const pdfData = await pdf(buffer);
        const text = pdfData.text?.trim() ?? '';
        const wordCount = this.countWords(text);
        this.logger.debug(`pdf-parse extracted ${wordCount} words`);

        // Enough embedded text: no need for the much slower OCR pass.
        if (wordCount >= TextExtractorService.MIN_PDF_WORD_COUNT) {
          return text;
        }
        this.logger.debug('pdf-parse gave too little text, falling back to OCR');
      } catch (err) {
        this.logger.warn(`pdf-parse failed: ${String(err)}`);
      }
    }

    // Fallback: OCR with Tesseract.
    return this.extractTextViaOCR(buffer, mimeType, originalFilename);
  }

  /**
   * Extracts text from a file via Tesseract OCR (Swedish language data).
   *
   * The buffer is written to a file inside a freshly created private temp
   * directory, so concurrent calls can never share or delete each other's
   * input. The directory is removed afterwards on a best-effort basis.
   *
   * NOTE(review): tesseract.js is documented as not reading PDF input, so this
   * fallback presumably only succeeds for image files — confirm whether PDF
   * pages need to be rasterised before reaching this method.
   *
   * @param buffer File contents (PDF or image).
   * @param mimeType Declared MIME type of the upload, if known.
   * @param originalFilename Original upload file name, if known.
   * @returns The text recognised by Tesseract (empty string if none).
   * @throws Error when the temp file cannot be written or OCR fails.
   */
  private async extractTextViaOCR(
    buffer: Buffer,
    mimeType?: string,
    originalFilename?: string,
  ): Promise<string> {
    let tempDir: string | undefined;
    try {
      this.logger.debug('Starting Tesseract OCR extraction');

      // mkdtemp appends random characters, which guarantees a unique location
      // even for requests arriving within the same millisecond.
      tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'ocr-'));
      const ext = this.resolveTempExtension(mimeType, originalFilename);
      const tempPath = path.join(tempDir, `input${ext}`);
      await fs.promises.writeFile(tempPath, buffer);

      const result = await Tesseract.recognize(tempPath, 'swe', {
        logger: (m) => this.logger.debug(`Tesseract: ${m.status}`),
      });
      const text = result.data.text || '';
      this.logger.debug(`Tesseract extracted ${this.countWords(text)} words`);
      return text;
    } catch (err) {
      this.logger.error(`OCR extraction failed: ${String(err)}`);
      throw new Error('Kunde inte extrahera text från flyern (pdf-parse + OCR misslyckades).');
    } finally {
      if (tempDir !== undefined) {
        await this.removeTempDir(tempDir);
      }
    }
  }

  /**
   * Removes the OCR temp directory. Failures are logged but never thrown:
   * cleanup must not mask the OCR result or the original error.
   *
   * @param dir Directory previously created for the OCR input file.
   */
  private async removeTempDir(dir: string): Promise<void> {
    try {
      await fs.promises.rm(dir, { recursive: true, force: true });
    } catch (err) {
      this.logger.warn(`Could not remove OCR temp directory ${dir}: ${String(err)}`);
    }
  }

  /**
   * Counts whitespace-separated words, ignoring empty fragments so that an
   * empty or whitespace-only string counts as zero words.
   *
   * @param text Text to measure.
   * @returns Number of non-empty words.
   */
  private countWords(text: string): number {
    return text.split(/\s+/).filter((word) => word.length > 0).length;
  }

  /**
   * Picks the file extension for the OCR temp file.
   *
   * A recognised MIME type wins; otherwise the extension of the original file
   * name is used when it is a short alphanumeric extension; otherwise `.pdf`.
   * The file name comes from the client, so anything unusual is rejected
   * rather than embedded in a file-system path.
   *
   * @param mimeType Declared MIME type of the upload, if known.
   * @param originalFilename Original upload file name, if known.
   * @returns Extension including the leading dot.
   */
  private resolveTempExtension(mimeType?: string, originalFilename?: string): string {
    switch (mimeType) {
      case 'image/png':
        return '.png';
      case 'image/webp':
        return '.webp';
      case 'image/jpeg':
        return '.jpg';
      case 'text/plain':
        return '.txt';
      case 'application/pdf':
        return '.pdf';
      default:
        break;
    }

    const originalExt = originalFilename ? path.extname(originalFilename).toLowerCase() : '';
    if (TextExtractorService.SAFE_EXTENSION.test(originalExt)) {
      return originalExt;
    }
    return '.pdf';
  }
}