feat: Enhance receipt parsing with additional preprocessing functions and improved AI response handling

Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
Nils-Johan Gynther
2026-05-03 11:19:58 +02:00
parent fa27c4d4de
commit f270f8510e
@@ -10,6 +10,21 @@ const MISTRAL_API_URL = 'https://api.mistral.ai/v1/chat/completions';
/** Vision-capable model; used when the receipt is supplied as an image. */
const RECEIPT_VISION_MODEL = 'mistral-small-2603';

/** Text model; used as the AI fallback for PDF/OCR text. */
const RECEIPT_TEXT_MODEL = 'mistral-small-latest';

/** Upper bound on the number of attempts made per Mistral request. */
const MAX_RETRIES = 3;

/** Time in milliseconds after which a pending Mistral request is aborted. */
const REQUEST_TIMEOUT_MS = 25_000;

/** Units an AI-parsed receipt item must use (after normalization) to be accepted. */
const ALLOWED_UNITS = new Set<string>([
  // count
  'st',
  // weight
  'kg', 'g',
  // volume
  'l', 'dl', 'cl', 'ml',
  // packaging
  'förp', 'pak', 'burk', 'flaska',
]);
const QUANTITY_RULES = `
Regler för quantity och unit:
@@ -46,14 +61,149 @@ export interface ParsedReceiptItemRaw {
origin?: string | null;
}
/**
 * Lower-case words that show up on receipts but never constitute a product
 * name on their own: currency markers, measurement units and totals/payment
 * vocabulary.
 */
const NON_NAME_TOKENS = new Set<string>([
  // currency
  'kr', 'sek',
  // units
  'st', 'kg', 'g', 'mg', 'l', 'dl', 'cl', 'ml',
  // totals, VAT, discounts and payment wording
  'moms', 'summa', 'rabatt', 'kort', 'kontant', 'totalt', 'att', 'betala',
  // minor currency unit, with and without diacritic
  'ore', 'öre',
]);
/**
 * Heuristic: does `value` contain at least one token that looks like part of
 * a product name?
 *
 * The text is lower-cased and split on everything except a–z, 0–9 and å/ä/ö.
 * Tokens shorter than three characters are ignored. A remaining token counts
 * as name-like when it contains a letter and is neither a known non-name word
 * (see NON_NAME_TOKENS) nor a number glued to such a word ("452kg", "90kr",
 * "33cl"). Without the latter rule, a pure weight/price continuation line
 * such as "0,452kg 29,90kr/kg" would be mistaken for a name line.
 *
 * @param value Raw or partially cleaned receipt text.
 * @returns `true` when at least one name-like token is present.
 */
function isLikelyNameLikeText(value: string): boolean {
  const tokens = value
    .toLowerCase()
    .split(/[^a-z0-9åäö]+/)
    .filter((token) => token.length >= 3);

  return tokens.some((token) => {
    if (NON_NAME_TOKENS.has(token)) return false;

    // Digits directly followed by a unit/currency word are a measurement,
    // not a name. Tokens like "7up" stay name-like because "up" is not a
    // known non-name word.
    const withoutLeadingDigits = token.replace(/^\d+/, '');
    if (withoutLeadingDigits !== token && NON_NAME_TOKENS.has(withoutLeadingDigits)) {
      return false;
    }

    // Purely numeric tokens carry no letters and are rejected here.
    return /[a-zåäö]/.test(token);
  });
}
/**
 * Strips quantity, unit and price fragments from a receipt line and returns
 * what is left if it still looks like a product name.
 *
 * Removed, in order: multipack expressions ("3x120g"), single amounts with a
 * unit ("1,5l", "2 st"), two-decimal prices ("12,90") and the characters `|`
 * and `*`. Whitespace is then collapsed.
 *
 * A unit only counts when it is NOT followed by another letter, digit or
 * underscore. A plain `\b` is ASCII-only in JavaScript and treats å/ä/ö as a
 * boundary, which would turn "4 läsk" into "äsk" and "2 stänger" into
 * "änger".
 *
 * @param line A single receipt text line.
 * @returns The cleaned name, or `null` when nothing name-like remains.
 */
function extractNameCandidate(line: string): string | null {
  const cleaned = line
    .replace(/\b\d+\s*[x×]\s*\d+(?:[.,]\d+)?\s*(?:ml|cl|dl|l|g|kg)(?![\wåäö])/gi, ' ')
    .replace(/\b\d+(?:[.,]\d+)?\s*(?:ml|cl|dl|l|g|kg|st|fp|pkt|pak|förp)(?![\wåäö])/gi, ' ')
    .replace(/\b\d+(?:[.,]\d{2})\b/g, ' ')
    .replace(/[|*]/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();

  return isLikelyNameLikeText(cleaned) ? cleaned : null;
}
/**
 * Splits extracted PDF text into receipt lines and glues numeric
 * continuation lines onto the name line they belong to.
 *
 * PDF receipts commonly put the product name on one line and its
 * weight/price on the next. Lines of two characters or fewer (after trimming)
 * are dropped and inner whitespace is collapsed. A line that does not look
 * like a name is appended to the previous output line when that previous
 * line does look like a name.
 *
 * @param text Raw text extracted from the PDF.
 * @returns The cleaned, merged lines in their original order.
 */
function preprocessPdfLines(text: string): string[] {
  const merged: string[] = [];

  for (const rawLine of text.split('\n')) {
    const trimmed = rawLine.trim();
    if (trimmed.length <= 2) continue;

    const current = trimmed.replace(/\s+/g, ' ').trim();
    const lastIndex = merged.length - 1;

    // Continuation = we already have a line, this one is not a name,
    // and the line before it is.
    const isContinuation =
      lastIndex >= 0 && !isLikelyNameLikeText(current) && isLikelyNameLikeText(merged[lastIndex]);

    if (isContinuation) {
      merged[lastIndex] = `${merged[lastIndex]} ${current}`;
    } else {
      merged.push(current);
    }
  }

  return merged;
}
/**
 * Normalizes a unit string to its canonical lower-case form.
 *
 * The value is trimmed, lower-cased and stripped of ALL dots (so both "st."
 * and "f.p." are handled). Known package synonyms map to `'förp'` and
 * `'styck'` maps to `'st'`. Any other unit is returned as-is in its cleaned
 * form; this function does not validate the unit.
 *
 * @param value Unit as written on the receipt or returned by the AI.
 * @returns The normalized unit, or `null` when the input is missing or
 *          contains nothing but whitespace/dots.
 */
function normalizeUnit(value: string | null | undefined): string | null {
  if (!value) return null;

  // A global regex is required: a string pattern replaces only the first dot.
  const unit = value.trim().toLowerCase().replace(/\./g, '');
  if (unit === '') return null;

  const packageAliases = ['forp', 'förpackning', 'forpackning', 'paket', 'pkt', 'fp', 'pack'];
  if (packageAliases.includes(unit)) return 'förp';
  if (unit === 'styck') return 'st';
  return unit;
}
/**
 * Reduces a model reply to the text most likely to be a JSON array.
 *
 * Markdown code fences (with or without a `json` tag) are replaced by spaces.
 * If the remaining text contains a `[` followed later by a `]`, everything
 * from the first `[` through the last `]` is returned; otherwise the
 * fence-free, trimmed text is returned unchanged.
 *
 * @param content Raw message content from the model.
 * @returns The extracted array text, or the cleaned content when no array
 *          brackets are found.
 */
function cleanJsonPayload(content: string): string {
  let text = content.replace(/```\s*json/gi, ' ');
  text = text.replace(/```/g, ' ').trim();

  const start = text.indexOf('[');
  const end = text.lastIndexOf(']');
  const hasArray = start >= 0 && end > start;

  return hasArray ? text.substring(start, end + 1).trim() : text;
}
/**
 * Reports whether `value` contains at least one letter from a–z or å/ä/ö,
 * in either case.
 */
function hasAnyLetter(value: string): boolean {
  return value.search(/[a-zåäö]/i) !== -1;
}
/**
 * Validates and normalizes one item from the AI's JSON reply.
 *
 * An item is rejected (`null`) when it is not an object, when `rawName` is
 * missing or contains no letter, when `quantity` is not a positive finite
 * number, or when the normalized `unit` is not in ALLOWED_UNITS.
 *
 * Numbers are accepted either as finite JSON numbers or as plain decimal
 * strings with `.` or `,` as the decimal separator ("1,5", "29.90"). Other
 * values are NOT coerced: bare `Number()` would turn `''` into 0 and `true`
 * into 1, producing bogus prices and quantities.
 *
 * @param input One element of the parsed JSON array (untrusted).
 * @returns The normalized item, with an unparsable `price` and blank
 *          `brand`/`origin` set to `null`; or `null` when the item is invalid.
 */
function normalizeParsedItem(input: unknown): ParsedReceiptItemRaw | null {
  if (!input || typeof input !== 'object') return null;
  const record = input as Record<string, unknown>;

  // Strict numeric reader: finite numbers, or decimal strings with '.'/','.
  const toFiniteNumber = (raw: unknown): number | null => {
    if (typeof raw === 'number') return Number.isFinite(raw) ? raw : null;
    if (typeof raw !== 'string') return null;
    const text = raw.trim();
    if (!/^[+-]?\d+(?:[.,]\d+)?$/.test(text)) return null;
    return Number(text.replace(',', '.'));
  };

  // Blank or non-string optional fields collapse to null.
  const toOptionalText = (raw: unknown): string | null =>
    typeof raw === 'string' ? raw.trim() || null : null;

  const rawName = typeof record.rawName === 'string' ? record.rawName.trim() : '';
  if (!rawName || !hasAnyLetter(rawName)) return null;

  const quantity = toFiniteNumber(record.quantity);
  if (quantity === null || quantity <= 0) return null;

  const unit = normalizeUnit(typeof record.unit === 'string' ? record.unit : null);
  if (!unit || !ALLOWED_UNITS.has(unit)) return null;

  return {
    rawName,
    quantity,
    unit,
    price: toFiniteNumber(record.price),
    brand: toOptionalText(record.brand),
    origin: toOptionalText(record.origin),
  };
}
// Regelbaserad parsning av en enstaka textrad från kvitto
function ruleBasedParseLine(line: string): ParsedReceiptItemRaw | null {
const normalized = line.toLowerCase();
const nameCandidate = extractNameCandidate(line);
const rawName = nameCandidate ?? line;
// Multipack: "3x120g", "2 x 1.5l"
const multiPack = /(\d+)\s*[x×]\s*(\d+(?:[\.,]\d+)?)\s*(ml|cl|dl|l|g|kg)\b/i.exec(normalized);
if (multiPack) {
return { rawName: line, quantity: 1, unit: 'förp', price: null, brand: null, origin: null };
return { rawName, quantity: 1, unit: 'förp', price: null, brand: null, origin: null };
}
// Förpackad vara med volym/vikt i namn: "5dl", "1,5l", "100g"
@@ -61,11 +211,12 @@ function ruleBasedParseLine(line: string): ParsedReceiptItemRaw | null {
if (singlePack) {
const qty = Number.parseFloat(singlePack[1].replace(',', '.'));
const unit = singlePack[2].toLowerCase();
// Lösvikt: kg/g utan "x" — returnera faktisk vikt
if ((unit === 'kg' || unit === 'g') && !normalized.includes('x')) {
return { rawName: line, quantity: qty, unit, price: null, brand: null, origin: null };
const isMultipack = multiPack !== null;
// Lösvikt: kg/g och inte multipack — returnera faktisk vikt
if ((unit === 'kg' || unit === 'g') && !isMultipack) {
return { rawName, quantity: qty, unit, price: null, brand: null, origin: null };
}
return { rawName: line, quantity: 1, unit: 'förp', price: null, brand: null, origin: null };
return { rawName, quantity: 1, unit: 'förp', price: null, brand: null, origin: null };
}
// Kan inte tolkas regelbaserat
@@ -104,15 +255,12 @@ export class ReceiptParsingService {
throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad.');
}
const lines = text
.split('\n')
.map((l) => l.trim())
.filter((l) => l.length > 2);
const combinedLines = preprocessPdfLines(text);
const resolved: ParsedReceiptItemRaw[] = [];
const needsAI: string[] = [];
for (const line of lines) {
for (const line of combinedLines) {
const item = ruleBasedParseLine(line);
if (item !== null) {
resolved.push(item);
@@ -165,8 +313,21 @@ export class ReceiptParsingService {
private parseJsonResponse(data: any, source: string): ParsedReceiptItemRaw[] {
try {
const content: string = data?.choices?.[0]?.message?.content ?? '';
const cleaned = content.replace(/` + '```' + `json|` + '```' + `/g, '').trim();
return JSON.parse(cleaned) as ParsedReceiptItemRaw[];
const cleaned = cleanJsonPayload(content);
const parsed = JSON.parse(cleaned);
if (!Array.isArray(parsed)) {
throw new Error('Svar är inte en JSON-array');
}
const normalized = parsed
.map((item) => normalizeParsedItem(item))
.filter((item): item is ParsedReceiptItemRaw => item !== null);
if (normalized.length === 0 && parsed.length > 0) {
throw new Error('Alla AI-poster underkändes i validering');
}
return normalized;
} catch (err) {
this.logger.error(`Kunde inte parsa Mistral-svar (${source}): ${err}`);
throw new BadRequestException('AI-svaret kunde inte tolkas. Försök igen.');
@@ -175,14 +336,42 @@ export class ReceiptParsingService {
private async callMistralWithRetry(body: object, apiKey: string, source: string): Promise<Response> {
for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
const response = await fetch(MISTRAL_API_URL, {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
let response: Response;
try {
response = await fetch(MISTRAL_API_URL, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${apiKey}`,
},
body: JSON.stringify(body),
signal: controller.signal,
});
} catch (err: any) {
clearTimeout(timeoutId);
const isAbort = err?.name === 'AbortError';
this.logger.warn(
isAbort
? `Mistral timeout (${source}, försök ${attempt}/${MAX_RETRIES})`
: `Mistral anrop misslyckades (${source}, försök ${attempt}/${MAX_RETRIES}): ${err}`,
);
if (attempt < MAX_RETRIES) {
await new Promise((resolve) => setTimeout(resolve, 1000 * attempt));
continue;
}
throw new ServiceUnavailableException(
isAbort
? 'Mistral API svarade inte i tid. Försök igen.'
: 'Mistral API är tillfälligt otillgänglig. Försök igen.',
);
} finally {
clearTimeout(timeoutId);
}
if (response.status === 503 || response.status === 429) {
const err = await response.text();