import { Logger } from '@nestjs/common';
import * as pdfParse from 'pdf-parse';
import { createWorker } from 'tesseract.js';
import { RecipeParser, ParsedRecipe } from './base.parser';

/** One purchased line item extracted from receipt text. */
interface ParsedReceiptItem {
  /** Product name as printed on the receipt, trimmed. */
  name: string;
  /** Integer count taken from the leading "<n>x" marker. */
  quantity: number;
  /** Amount printed before "SEK" on the same line. */
  price: number;
}

/**
 * Matches a receipt line such as "2x Apple 10.00 SEK".
 *
 * Groups: 1 = quantity, 2 = item name, 3 = price. The price accepts a single
 * decimal separator, either "." or "," ("10,00"), and must contain at least
 * one digit, so a stray "." can never be captured and turned into NaN.
 * Hoisted so the expression is not rebuilt for every line.
 */
const RECEIPT_LINE_PATTERN = /(\d+)x\s+(.+?)\s+(\d+(?:[.,]\d*)?|[.,]\d+)\s*SEK/;

/**
 * Extracts line items from receipts supplied as PDF or image buffers.
 *
 * It extends RecipeParser but never claims a URL: `canHandle` always returns
 * false, so the parser has to be invoked directly by the caller.
 */
export class ReceiptParser extends RecipeParser {
  private readonly logger = new Logger(ReceiptParser.name);

  /**
   * Always returns false; receipts arrive as buffers, not URLs.
   *
   * @param url Ignored.
   */
  canHandle(url: string): boolean {
    return false;
  }

  /**
   * Extracts the text layer of a PDF and parses it into receipt items.
   *
   * @param buffer Raw PDF bytes.
   * @returns The items found; empty when no line matches the receipt format.
   * @throws Error with a generic message when text extraction or parsing fails.
   */
  async parseFromPdf(buffer: Buffer): Promise<ParsedReceiptItem[]> {
    try {
      this.logger.log('Parsing PDF receipt...');
      const data = await pdfParse(buffer);
      return this.parseReceiptText(data.text);
    } catch (error) {
      this.logger.error('Failed to parse PDF receipt', error);
      throw new Error('Failed to parse PDF receipt');
    }
  }

  /**
   * Runs OCR on an image and parses the recognised text into receipt items.
   *
   * @param buffer Raw image bytes.
   * @returns The items found; empty when no line matches the receipt format.
   * @throws Error with a generic message when OCR or parsing fails.
   */
  async parseFromImage(buffer: Buffer): Promise<ParsedReceiptItem[]> {
    try {
      this.logger.log('Parsing image receipt...');
      // NOTE(review): only the English model is loaded although amounts are in
      // SEK; confirm whether Swedish text needs 'swe+eng'.
      const worker = await createWorker('eng');
      try {
        const result = await worker.recognize(buffer);
        return this.parseReceiptText(result.data.text);
      } finally {
        // Always release the OCR worker, including when recognition throws;
        // otherwise every failed import leaks a worker.
        await worker.terminate();
      }
    } catch (error) {
      this.logger.error('Failed to parse image receipt', error);
      throw new Error('Failed to parse image receipt');
    }
  }

  /**
   * Parses raw receipt text, one candidate item per line.
   *
   * Only lines shaped like "2x Apple 10.00 SEK" are recognised; everything
   * else is skipped. This is intentionally simple placeholder logic.
   *
   * @param text Text extracted from a PDF or produced by OCR.
   * @returns Items in the order they appear in the text.
   */
  parseReceiptText(text: string): ParsedReceiptItem[] {
    this.logger.log('Parsing receipt text...');
    const items: ParsedReceiptItem[] = [];

    // Split on LF or CRLF so a trailing "\r" never ends up in a line.
    for (const line of text.split(/\r?\n/)) {
      if (line.trim() === '') continue;

      const match = RECEIPT_LINE_PATTERN.exec(line);
      if (!match) continue;

      const quantity = Number.parseInt(match[1] ?? '', 10);
      const name = (match[2] ?? '').trim();
      // Normalise a decimal comma ("10,00") before converting.
      const price = Number.parseFloat((match[3] ?? '').replace(',', '.'));

      // Defensive: never emit an item carrying NaN or Infinity.
      if (!Number.isFinite(quantity) || !Number.isFinite(price)) continue;

      items.push({ name, quantity, price });
    }

    return items;
  }
}
/**
 * Imports from an uploaded file.
 *
 * The upload is treated as an image when its MIME type starts with "image/"
 * and as a PDF otherwise.
 *
 * NOTE(review): the declared return types of this method and of
 * importFromBuffer were lost from the reviewed text; they are reconstructed
 * here from what the receipt parser returns. Confirm against the real file.
 *
 * @param file Uploaded file with an in-memory buffer.
 * @returns The receipt items parsed from the file.
 * @throws BadRequestException when no file or buffer was supplied.
 * @throws ServiceUnavailableException when parsing fails.
 */
async importFromUpload(file: Express.Multer.File): Promise<ReceiptImportResult['items']> {
  // Fail with a client error instead of a TypeError on a missing upload.
  if (!file?.buffer) {
    throw new BadRequestException('Ingen fil skickades med.');
  }

  const kind: UploadKind = (file.mimetype ?? '').startsWith('image/') ? 'image' : 'pdf';
  return this.importFromBuffer(file.buffer, kind);
}

/**
 * Imports from a buffer holding either a PDF or an image.
 *
 * @param buffer Raw file bytes.
 * @param kind 'pdf' selects PDF text extraction; anything else selects OCR.
 * @returns The receipt items parsed from the buffer.
 * @throws ServiceUnavailableException when the receipt parser fails.
 */
async importFromBuffer(
  buffer: Buffer,
  kind: UploadKind,
): Promise<ReceiptImportResult['items']> {
  try {
    // `await` is required: returning the bare promise would leave the try
    // block before it settles, so a rejection would skip the catch below.
    if (kind === 'pdf') {
      return await this.receiptParser.parseFromPdf(buffer);
    }
    return await this.receiptParser.parseFromImage(buffer);
  } catch (error) {
    this.logger.error(`Fel vid import av ${kind}: ${error}`);
    throw new ServiceUnavailableException(`Kunde inte importera ${kind}`);
  }
}

/** Returns true when the input starts with "http://" or "https://" (case-sensitive). */
private isUrl(input: string): boolean {
  return /^https?:\/\//.test(input);
}

/** Returns true when the input contains a forward slash or a backslash. */
private looksLikeLocalFile(input: string): boolean {
  return /[\\/]/.test(input);
}
'').toLowerCase(); - - if (type.includes('pdf') || name.endsWith('.pdf')) { - return 'pdf'; - } - - if ( - type.startsWith('image/') || - ['.png', '.jpg', '.jpeg', '.webp', '.bmp'].some((ext) => name.endsWith(ext)) - ) { - return 'image'; - } - - throw new UnsupportedMediaTypeException( - 'Endast PDF, PNG, JPG, JPEG, WEBP och BMP stöds.', - ); - } - - private async extractTextFromPdf(buffer: Buffer): Promise { - try { - const result = await pdfParse(buffer); - const text = result.text?.replace(/\u0000/g, '').trim(); - - if (!text) { - throw new BadRequestException( - 'PDF-filen saknar läsbar text. Prova bildimport om det är en skannad sida.', - ); - } - - return text; - } catch (error) { - if (error instanceof BadRequestException) { - throw error; - } - - this.logger.error('PDF-import misslyckades', error); - throw new ServiceUnavailableException('PDF-importen misslyckades.'); - } - } - - private async extractTextFromImage(buffer: Buffer): Promise { - const worker = await createWorker('swe+eng'); - - try { - const result = await worker.recognize(buffer); - const text = result.data.text?.trim(); - - if (!text) { - throw new BadRequestException('Ingen text hittades i bilden.'); - } - - return text; - } catch (error) { - if (error instanceof BadRequestException) { - throw error; - } - - this.logger.error('OCR-import misslyckades', error); - throw new ServiceUnavailableException('OCR-importen misslyckades.'); - } finally { - await worker.terminate(); - } - } - - private normalizeImportedTextToMarkdown(text: string, sourceName?: string): string { - const cleanedText = text - .replace(/\r/g, '') - .replace(/[ \t]+/g, ' ') - .replace(/\n{3,}/g, '\n\n') - .trim(); - - if (!cleanedText) { - throw new BadRequestException('Ingen läsbar text hittades i filen.'); - } - - const title = cleanedText.split('\n').find((line) => line.trim().length > 3)?.trim() ?? 
'Importerat recept'; - const ingredients: string[] = []; - const instructions: string[] = []; - let section: 'unknown' | 'ingredients' | 'instructions' = 'unknown'; - - for (const rawLine of cleanedText.split('\n')) { - const line = rawLine.trim(); - if (!line || line === title) { - continue; - } - - const lower = line.toLowerCase(); - - if (/^ingred/i.test(lower)) { - section = 'ingredients'; - continue; - } - - if (/^(gör så här|gor sa har|instruktioner|tillvägagångssätt|tillvagagangssatt|method|instructions)/i.test(lower)) { - section = 'instructions'; - continue; - } - - if (section === 'unknown') { - section = this.looksLikeIngredientLine(line) ? 'ingredients' : 'instructions'; - } - - if (section === 'ingredients') { - ingredients.push(line.startsWith('-') ? line : `- ${line}`); - } else { - instructions.push(line); - } - } - - return [ - `# ${title}`, - '', - '## Ingredienser', - ...(ingredients.length > 0 ? ingredients : ['- Komplettera ingredienser manuellt']), - '', - '## Tillvägagångssätt', - ...(instructions.length > 0 ? instructions : ['Komplettera tillagningsstegen manuellt.']), - '', - sourceName ? `Källa: ${sourceName}` : '', - ] - .filter(Boolean) - .join('\n'); - } - - private looksLikeIngredientLine(line: string): boolean { - return ( - /^[-*•]\s+/.test(line) || - /^\d+[.,]?\d*\s+/.test(line) || - /\b(g|kg|hg|mg|ml|dl|cl|l|tsk|msk|krm|st|pkt|förp|klyfta)\b/i.test(line) - ); - } - - /** - * Skrapar recept från en URL - * - * Använder site-specifika parsers om tillgängliga, - * annars fallback till generisk parser. - * - * @param url URL till receptsidan - * @returns Markdown-format - */ private async scrapeRecipeFromUrl(url: string): Promise { try { this.logger.log(`Hämtar HTML från: ${url}`);