From f270f8510eec02f5678a283058a23952a0422aac Mon Sep 17 00:00:00 2001 From: Nils-Johan Gynther Date: Sun, 3 May 2026 11:19:58 +0200 Subject: [PATCH] feat: Enhance receipt parsing with additional preprocessing functions and improved AI response handling Co-authored-by: Copilot --- .../receipt-parsing.service.ts | 229 ++++++++++++++++-- 1 file changed, 209 insertions(+), 20 deletions(-) diff --git a/backend/src/receipt-parsing/receipt-parsing.service.ts b/backend/src/receipt-parsing/receipt-parsing.service.ts index 8cf4142..625df05 100644 --- a/backend/src/receipt-parsing/receipt-parsing.service.ts +++ b/backend/src/receipt-parsing/receipt-parsing.service.ts @@ -10,6 +10,21 @@ const MISTRAL_API_URL = 'https://api.mistral.ai/v1/chat/completions'; const RECEIPT_VISION_MODEL = 'mistral-small-2603'; // vision — används för bild-input const RECEIPT_TEXT_MODEL = 'mistral-small-latest'; // text — används som AI-fallback för PDF/OCR-text const MAX_RETRIES = 3; +const REQUEST_TIMEOUT_MS = 25_000; + +const ALLOWED_UNITS = new Set([ + 'st', + 'kg', + 'g', + 'l', + 'dl', + 'cl', + 'ml', + 'förp', + 'pak', + 'burk', + 'flaska', +]); const QUANTITY_RULES = ` Regler för quantity och unit: @@ -46,14 +61,149 @@ export interface ParsedReceiptItemRaw { origin?: string | null; } +const NON_NAME_TOKENS = new Set([ + 'kr', + 'sek', + 'st', + 'kg', + 'g', + 'mg', + 'l', + 'dl', + 'cl', + 'ml', + 'moms', + 'summa', + 'rabatt', + 'kort', + 'kontant', + 'totalt', + 'att', + 'betala', + 'ore', + 'öre', +]); + +function isLikelyNameLikeText(value: string): boolean { + const tokens = value + .toLowerCase() + .split(/[^a-z0-9åäö]+/) + .map((t) => t.trim()) + .filter((t) => t.length >= 3); + + if (tokens.length === 0) { + return false; + } + + const meaningful = tokens.filter((token) => { + if (NON_NAME_TOKENS.has(token)) return false; + if (/^\d+$/.test(token)) return false; + if (/^\d+(?:[\.,]\d+)?$/.test(token)) return false; + return /[a-zåäö]/i.test(token); + }); + + return meaningful.length > 0; +} + +function extractNameCandidate(line: string): string | null { + const cleaned = line + .replace(/\b\d+\s*[x×]\s*\d+(?:[\.,]\d+)?\s*(ml|cl|dl|l|g|kg)\b/gi, ' ') + .replace(/\b\d+(?:[\.,]\d+)?\s*(ml|cl|dl|l|g|kg|st|fp|pkt|pak|förp)\b/gi, ' ') + .replace(/\b\d+(?:[\.,]\d{2})\b/g, ' ') + .replace(/[|*]/g, ' ') + .replace(/\s+/g, ' ') + .trim(); + + return isLikelyNameLikeText(cleaned) ? cleaned : null; +} + +function preprocessPdfLines(text: string): string[] { + const lines = text + .split('\n') + .map((l) => l.trim()) + .filter((l) => l.length > 2); + + // Vanligt i PDF-kvitton: namnrad följs av separat rad med vikt/pris. + // Slå ihop en numerisk fortsättningsrad med föregående namnrad. + const combinedLines: string[] = []; + for (const line of lines) { + const normalizedLine = line.replace(/\s+/g, ' ').trim(); + if ( + combinedLines.length > 0 && + !isLikelyNameLikeText(normalizedLine) && + isLikelyNameLikeText(combinedLines[combinedLines.length - 1]) + ) { + combinedLines[combinedLines.length - 1] = `${combinedLines[combinedLines.length - 1]} ${normalizedLine}`; + } else { + combinedLines.push(normalizedLine); + } + } + + return combinedLines; +} + +function normalizeUnit(value: string | null | undefined): string | null { + if (!value) return null; + + const unit = value.trim().toLowerCase().replace('.', ''); + if (['forp', 'förpackning', 'forpackning', 'paket', 'pkt', 'fp', 'pack'].includes(unit)) return 'förp'; + if (['styck'].includes(unit)) return 'st'; + return unit; +} + +function cleanJsonPayload(content: string): string { + const withoutFences = content + .replace(/```\s*json/gi, ' ') + .replace(/```/g, ' ') + .trim(); + + const firstArray = withoutFences.indexOf('['); + const lastArray = withoutFences.lastIndexOf(']'); + if (firstArray !== -1 && lastArray > firstArray) { + return withoutFences.slice(firstArray, lastArray + 1).trim(); + } + + return withoutFences; +} + +function hasAnyLetter(value: string): boolean { + return /[a-zåäö]/i.test(value); +} + +function normalizeParsedItem(input: any): ParsedReceiptItemRaw | null { + if (!input || typeof input !== 'object') return null; + + const rawName = typeof input.rawName === 'string' ? input.rawName.trim() : ''; + if (!rawName || !hasAnyLetter(rawName)) return null; + + const quantity = Number(input.quantity); + if (!Number.isFinite(quantity) || quantity <= 0) return null; + + const unit = normalizeUnit(typeof input.unit === 'string' ? input.unit : null); + if (!unit || !ALLOWED_UNITS.has(unit)) return null; + + const price = input.price == null ? null : Number(input.price); + + return { + rawName, + quantity, + unit, + price: Number.isFinite(price as number) ? (price as number) : null, + brand: typeof input.brand === 'string' ? input.brand.trim() || null : null, + origin: typeof input.origin === 'string' ? input.origin.trim() || null : null, + }; +} + // Regelbaserad parsning av en enstaka textrad från kvitto function ruleBasedParseLine(line: string): ParsedReceiptItemRaw | null { const normalized = line.toLowerCase(); + const nameCandidate = extractNameCandidate(line); + const rawName = nameCandidate ?? line; // Multipack: "3x120g", "2 x 1.5l" const multiPack = /(\d+)\s*[x×]\s*(\d+(?:[\.,]\d+)?)\s*(ml|cl|dl|l|g|kg)\b/i.exec(normalized); if (multiPack) { - return { rawName: line, quantity: 1, unit: 'förp', price: null, brand: null, origin: null }; + return { rawName, quantity: 1, unit: 'förp', price: null, brand: null, origin: null }; } // Förpackad vara med volym/vikt i namn: "5dl", "1,5l", "100g" @@ -61,11 +211,12 @@ function ruleBasedParseLine(line: string): ParsedReceiptItemRaw | null { if (singlePack) { const qty = Number.parseFloat(singlePack[1].replace(',', '.')); const unit = singlePack[2].toLowerCase(); - // Lösvikt: kg/g utan "x" — returnera faktisk vikt - if ((unit === 'kg' || unit === 'g') && !normalized.includes('x')) { - return { rawName: line, quantity: qty, unit, price: null, brand: null, origin: null }; + const isMultipack = multiPack !== null; + // Lösvikt: kg/g och inte multipack — returnera faktisk vikt + if ((unit === 'kg' || unit === 'g') && !isMultipack) { + return { rawName, quantity: qty, unit, price: null, brand: null, origin: null }; } - return { rawName: line, quantity: 1, unit: 'förp', price: null, brand: null, origin: null }; + return { rawName, quantity: 1, unit: 'förp', price: null, brand: null, origin: null }; } // Kan inte tolkas regelbaserat @@ -104,15 +255,12 @@ export class ReceiptParsingService { throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad.'); } - const lines = text - .split('\n') - .map((l) => l.trim()) - .filter((l) => l.length > 2); + const combinedLines = preprocessPdfLines(text); const resolved: ParsedReceiptItemRaw[] = []; const needsAI: string[] = []; - for (const line of lines) { + for (const line of combinedLines) { const item = ruleBasedParseLine(line); if (item !== null) { resolved.push(item); @@ -165,8 +313,21 @@ export class ReceiptParsingService { private parseJsonResponse(data: any, source: string): ParsedReceiptItemRaw[] { try { const content: string = data?.choices?.[0]?.message?.content ?? ''; - const cleaned = content.replace(/` + '```' + `json|` + '```' + `/g, '').trim(); - return JSON.parse(cleaned) as ParsedReceiptItemRaw[]; + const cleaned = cleanJsonPayload(content); + const parsed = JSON.parse(cleaned); + if (!Array.isArray(parsed)) { + throw new Error('Svar är inte en JSON-array'); + } + + const normalized = parsed + .map((item) => normalizeParsedItem(item)) + .filter((item): item is ParsedReceiptItemRaw => item !== null); + + if (normalized.length === 0 && parsed.length > 0) { + throw new Error('Alla AI-poster underkändes i validering'); + } + + return normalized; } catch (err) { this.logger.error(`Kunde inte parsa Mistral-svar (${source}): ${err}`); throw new BadRequestException('AI-svaret kunde inte tolkas. Försök igen.'); @@ -175,14 +336,42 @@ export class ReceiptParsingService { private async callMistralWithRetry(body: object, apiKey: string, source: string): Promise { for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) { - const response = await fetch(MISTRAL_API_URL, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - Authorization: `Bearer ${apiKey}`, - }, - body: JSON.stringify(body), - }); + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS); + let response: Response; + + try { + response = await fetch(MISTRAL_API_URL, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${apiKey}`, + }, + body: JSON.stringify(body), + signal: controller.signal, + }); + } catch (err: any) { + clearTimeout(timeoutId); + const isAbort = err?.name === 'AbortError'; + this.logger.warn( + isAbort + ? `Mistral timeout (${source}, försök ${attempt}/${MAX_RETRIES})` + : `Mistral anrop misslyckades (${source}, försök ${attempt}/${MAX_RETRIES}): ${err}`, + ); + + if (attempt < MAX_RETRIES) { + await new Promise((resolve) => setTimeout(resolve, 1000 * attempt)); + continue; + } + + throw new ServiceUnavailableException( + isAbort + ? 'Mistral API svarade inte i tid. Försök igen.' + : 'Mistral API är tillfälligt otillgänglig. Försök igen.', + ); + } finally { + clearTimeout(timeoutId); + } if (response.status === 503 || response.status === 429) { const err = await response.text();