feat: Enhance receipt parsing with additional preprocessing functions and improved AI response handling
Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
@@ -10,6 +10,21 @@ const MISTRAL_API_URL = 'https://api.mistral.ai/v1/chat/completions';
|
|||||||
const RECEIPT_VISION_MODEL = 'mistral-small-2603'; // vision — används för bild-input
|
const RECEIPT_VISION_MODEL = 'mistral-small-2603'; // vision — används för bild-input
|
||||||
const RECEIPT_TEXT_MODEL = 'mistral-small-latest'; // text — används som AI-fallback för PDF/OCR-text
|
const RECEIPT_TEXT_MODEL = 'mistral-small-latest'; // text — används som AI-fallback för PDF/OCR-text
|
||||||
const MAX_RETRIES = 3;
|
const MAX_RETRIES = 3;
|
||||||
|
// Upper bound, in milliseconds, for a single Mistral HTTP request; when it
// elapses the request is aborted through an AbortController.
const REQUEST_TIMEOUT_MS = 25_000;
|
||||||
|
|
||||||
|
/**
 * Unit strings accepted for a parsed receipt item. AI-produced items whose
 * normalized unit is not in this set are discarded during validation.
 */
const ALLOWED_UNITS = new Set([
  // Count and weight
  'st',
  'kg',
  'g',
  // Volume
  'l',
  'dl',
  'cl',
  'ml',
  // Packaging
  'förp',
  'pak',
  'burk',
  'flaska',
]);
|
||||||
|
|
||||||
const QUANTITY_RULES = `
|
const QUANTITY_RULES = `
|
||||||
Regler för quantity och unit:
|
Regler för quantity och unit:
|
||||||
@@ -46,14 +61,149 @@ export interface ParsedReceiptItemRaw {
|
|||||||
origin?: string | null;
|
origin?: string | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Lowercase tokens that never count as part of a product name: currency
 * markers, measurement units and receipt summary/payment keywords.
 * Both 'ore' and 'öre' are listed so the word is caught with or without
 * the diacritic.
 */
const NON_NAME_TOKENS = new Set([
  // Currency
  'kr',
  'sek',
  'ore',
  'öre',
  // Units
  'st',
  'kg',
  'g',
  'mg',
  'l',
  'dl',
  'cl',
  'ml',
  // Summary and payment rows
  'moms',
  'summa',
  'rabatt',
  'kort',
  'kontant',
  'totalt',
  'att',
  'betala',
]);
|
||||||
|
|
||||||
|
/**
 * Heuristically decides whether a piece of receipt text contains something
 * that reads like a product name, as opposed to only amounts, units and
 * receipt keywords.
 *
 * The text is lowercased and split into runs of letters/digits. Runs shorter
 * than three characters are ignored. The text is name-like when at least one
 * remaining run contains a letter and is not listed in NON_NAME_TOKENS.
 *
 * @param value Receipt line (raw or already partially cleaned).
 * @returns `true` when at least one meaningful word-like token is found.
 */
function isLikelyNameLikeText(value: string): boolean {
  const tokens = value
    .toLowerCase()
    // Unicode-aware split: accented letters (é, ü, â, ...) and combining marks
    // stay inside their word. An ASCII-only class would cut "pâté" into
    // "p" + "t" and drop both fragments as too short.
    .split(/[^\p{L}\p{M}0-9]+/u)
    .filter((token) => token.length >= 3);

  // Tokens hold only letters, marks and digits at this point, so "contains a
  // letter" is enough to rule out purely numeric tokens.
  return tokens.some((token) => !NON_NAME_TOKENS.has(token) && /\p{L}/u.test(token));
}
|
||||||
|
|
||||||
|
/**
 * Strips quantity, unit and amount fragments from a receipt line and returns
 * the remainder when it still looks like a product name.
 *
 * @param line One line of receipt text.
 * @returns The cleaned, whitespace-collapsed text, or `null` when what is
 *          left is not name-like according to `isLikelyNameLikeText`.
 */
function extractNameCandidate(line: string): string | null {
  const cleaned = line
    // Multipack sizes such as "3x120g" or "2 x 1,5l".
    .replace(/\b\d+\s*[x×]\s*\d+(?:[\.,]\d+)?\s*(ml|cl|dl|l|g|kg)\b/gi, ' ')
    // Single amounts followed by a unit such as "500g", "1,5 l" or "2 st".
    .replace(/\b\d+(?:[\.,]\d+)?\s*(ml|cl|dl|l|g|kg|st|fp|pkt|pak|förp)\b/gi, ' ')
    // Numbers with exactly two decimals such as "12,90" (presumably prices).
    .replace(/\b\d+(?:[\.,]\d{2})\b/g, ' ')
    // Pipe and asterisk characters.
    .replace(/[|*]/g, ' ')
    // Collapse the gaps left behind by the removals above.
    .replace(/\s+/g, ' ')
    .trim();

  return isLikelyNameLikeText(cleaned) ? cleaned : null;
}
|
||||||
|
|
||||||
|
/**
 * Splits extracted PDF text into receipt lines and glues continuation lines
 * onto the name line they belong to.
 *
 * Common in PDF receipts: a name line is followed by a separate line holding
 * only weight/price. Such a non-name line is appended to the previous line
 * when that previous line is name-like.
 *
 * @param text Full text extracted from the PDF.
 * @returns Lines with internal whitespace collapsed. Input lines of two
 *          characters or fewer (after trimming) are dropped.
 */
function preprocessPdfLines(text: string): string[] {
  const combinedLines: string[] = [];

  for (const rawLine of text.split('\n')) {
    const trimmed = rawLine.trim();
    if (trimmed.length <= 2) continue;

    const normalizedLine = trimmed.replace(/\s+/g, ' ').trim();
    const lastIndex = combinedLines.length - 1;
    const previous = lastIndex >= 0 ? combinedLines[lastIndex] : undefined;

    // Merge only when the current line carries no name of its own and the
    // previous (possibly already merged) line does.
    if (
      previous !== undefined &&
      !isLikelyNameLikeText(normalizedLine) &&
      isLikelyNameLikeText(previous)
    ) {
      combinedLines[lastIndex] = `${previous} ${normalizedLine}`;
    } else {
      combinedLines.push(normalizedLine);
    }
  }

  return combinedLines;
}
|
||||||
|
|
||||||
|
/**
 * Canonicalizes a unit string: lowercases it, removes every dot, trims it and
 * maps known package/piece synonyms to the canonical 'förp' / 'st'.
 *
 * @param value Unit text as delivered by the parser, possibly missing.
 * @returns The canonical unit, the cleaned input when no synonym matches, or
 *          `null` when the input is missing or empty after cleaning.
 */
function normalizeUnit(value: string | null | undefined): string | null {
  if (!value) return null;

  // A regex with the g flag is required here: String.replace with a string
  // pattern removes only the first dot, which left "f.p." as "fp.".
  const unit = value.toLowerCase().replace(/\./g, '').trim();
  if (unit === '') return null;

  const packageSynonyms = ['forp', 'förpackning', 'forpackning', 'paket', 'pkt', 'fp', 'pack'];
  if (packageSynonyms.includes(unit)) return 'förp';
  if (unit === 'styck') return 'st';
  return unit;
}
|
||||||
|
|
||||||
|
/**
 * Extracts the JSON array text from a model reply that may be wrapped in
 * Markdown code fences and/or surrounded by prose.
 *
 * @param content Raw message content returned by the model.
 * @returns The text from the first '[' up to the bracket that closes it.
 *          When the brackets are unbalanced it falls back to the span from
 *          the first '[' to the last ']'. When no such span exists, the
 *          fence-stripped, trimmed content is returned.
 */
function cleanJsonPayload(content: string): string {
  const withoutFences = content
    .replace(/```\s*json/gi, ' ')
    .replace(/```/g, ' ')
    .trim();

  const start = withoutFences.indexOf('[');
  if (start === -1) return withoutFences;

  // Walk forward to the bracket that closes the first '['. Brackets inside
  // JSON string literals are skipped, so trailing prose such as "(see [1])"
  // no longer ends up inside the extracted payload.
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < withoutFences.length; i++) {
    const ch = withoutFences[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === '[') {
      depth++;
    } else if (ch === ']') {
      depth--;
      if (depth === 0) return withoutFences.slice(start, i + 1).trim();
    }
  }

  // Unbalanced input (e.g. a truncated reply): keep the widest candidate span
  // and let the JSON parser report the problem.
  const end = withoutFences.lastIndexOf(']');
  return end > start ? withoutFences.slice(start, end + 1).trim() : withoutFences;
}
|
||||||
|
|
||||||
|
/**
 * Reports whether the text contains at least one letter.
 *
 * Uses the Unicode letter category so that names written with letters
 * outside a-z/å/ä/ö (for example "É" or "Ø") are recognized too.
 *
 * @param value Text to inspect.
 * @returns `true` when any character is a Unicode letter.
 */
function hasAnyLetter(value: string): boolean {
  return /\p{L}/u.test(value);
}
|
||||||
|
|
||||||
|
/**
 * Validates and normalizes one element of the AI-produced item array.
 *
 * An item is rejected (`null`) when it is not an object, when `rawName` is
 * missing or contains no letter, when `quantity` is not a positive finite
 * number, or when the normalized `unit` is not in ALLOWED_UNITS.
 *
 * Numeric fields accept real numbers and numeric strings, including
 * comma-decimal strings such as "1,5". Empty strings, booleans and other
 * types are treated as "no value" instead of being coerced to 0 or 1.
 *
 * @param input Untrusted value taken from the parsed JSON response.
 * @returns The normalized item, or `null` when validation fails.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- signature kept for existing callers
function normalizeParsedItem(input: any): ParsedReceiptItemRaw | null {
  if (!input || typeof input !== 'object') return null;
  const record: Record<string, unknown> = input;

  // Finite number from a number or numeric string; null for everything else.
  const toNumber = (value: unknown): number | null => {
    if (typeof value === 'number') return Number.isFinite(value) ? value : null;
    if (typeof value !== 'string') return null;
    const text = value.trim().replace(',', '.');
    if (text === '') return null; // Number('') would be 0
    const parsed = Number(text);
    return Number.isFinite(parsed) ? parsed : null;
  };

  // Trimmed, non-empty string; null for everything else.
  const toText = (value: unknown): string | null => {
    if (typeof value !== 'string') return null;
    const text = value.trim();
    return text === '' ? null : text;
  };

  const rawName = toText(record.rawName);
  if (rawName === null || !hasAnyLetter(rawName)) return null;

  const quantity = toNumber(record.quantity);
  if (quantity === null || quantity <= 0) return null;

  const unit = normalizeUnit(typeof record.unit === 'string' ? record.unit : null);
  if (!unit || !ALLOWED_UNITS.has(unit)) return null;

  return {
    rawName,
    quantity,
    unit,
    price: toNumber(record.price),
    brand: toText(record.brand),
    origin: toText(record.origin),
  };
}
|
||||||
|
|
||||||
// Regelbaserad parsning av en enstaka textrad från kvitto
|
// Regelbaserad parsning av en enstaka textrad från kvitto
|
||||||
function ruleBasedParseLine(line: string): ParsedReceiptItemRaw | null {
|
function ruleBasedParseLine(line: string): ParsedReceiptItemRaw | null {
|
||||||
const normalized = line.toLowerCase();
|
const normalized = line.toLowerCase();
|
||||||
|
const nameCandidate = extractNameCandidate(line);
|
||||||
|
const rawName = nameCandidate ?? line;
|
||||||
|
|
||||||
// Multipack: "3x120g", "2 x 1.5l"
|
// Multipack: "3x120g", "2 x 1.5l"
|
||||||
const multiPack = /(\d+)\s*[x×]\s*(\d+(?:[\.,]\d+)?)\s*(ml|cl|dl|l|g|kg)\b/i.exec(normalized);
|
const multiPack = /(\d+)\s*[x×]\s*(\d+(?:[\.,]\d+)?)\s*(ml|cl|dl|l|g|kg)\b/i.exec(normalized);
|
||||||
if (multiPack) {
|
if (multiPack) {
|
||||||
return { rawName: line, quantity: 1, unit: 'förp', price: null, brand: null, origin: null };
|
return { rawName, quantity: 1, unit: 'förp', price: null, brand: null, origin: null };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Förpackad vara med volym/vikt i namn: "5dl", "1,5l", "100g"
|
// Förpackad vara med volym/vikt i namn: "5dl", "1,5l", "100g"
|
||||||
@@ -61,11 +211,12 @@ function ruleBasedParseLine(line: string): ParsedReceiptItemRaw | null {
|
|||||||
if (singlePack) {
|
if (singlePack) {
|
||||||
const qty = Number.parseFloat(singlePack[1].replace(',', '.'));
|
const qty = Number.parseFloat(singlePack[1].replace(',', '.'));
|
||||||
const unit = singlePack[2].toLowerCase();
|
const unit = singlePack[2].toLowerCase();
|
||||||
// Lösvikt: kg/g utan "x" — returnera faktisk vikt
|
const isMultipack = multiPack !== null;
|
||||||
if ((unit === 'kg' || unit === 'g') && !normalized.includes('x')) {
|
// Lösvikt: kg/g och inte multipack — returnera faktisk vikt
|
||||||
return { rawName: line, quantity: qty, unit, price: null, brand: null, origin: null };
|
if ((unit === 'kg' || unit === 'g') && !isMultipack) {
|
||||||
|
return { rawName, quantity: qty, unit, price: null, brand: null, origin: null };
|
||||||
}
|
}
|
||||||
return { rawName: line, quantity: 1, unit: 'förp', price: null, brand: null, origin: null };
|
return { rawName, quantity: 1, unit: 'förp', price: null, brand: null, origin: null };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Kan inte tolkas regelbaserat
|
// Kan inte tolkas regelbaserat
|
||||||
@@ -104,15 +255,12 @@ export class ReceiptParsingService {
|
|||||||
throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad.');
|
throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad.');
|
||||||
}
|
}
|
||||||
|
|
||||||
const lines = text
|
const combinedLines = preprocessPdfLines(text);
|
||||||
.split('\n')
|
|
||||||
.map((l) => l.trim())
|
|
||||||
.filter((l) => l.length > 2);
|
|
||||||
|
|
||||||
const resolved: ParsedReceiptItemRaw[] = [];
|
const resolved: ParsedReceiptItemRaw[] = [];
|
||||||
const needsAI: string[] = [];
|
const needsAI: string[] = [];
|
||||||
|
|
||||||
for (const line of lines) {
|
for (const line of combinedLines) {
|
||||||
const item = ruleBasedParseLine(line);
|
const item = ruleBasedParseLine(line);
|
||||||
if (item !== null) {
|
if (item !== null) {
|
||||||
resolved.push(item);
|
resolved.push(item);
|
||||||
@@ -165,8 +313,21 @@ export class ReceiptParsingService {
|
|||||||
private parseJsonResponse(data: any, source: string): ParsedReceiptItemRaw[] {
|
private parseJsonResponse(data: any, source: string): ParsedReceiptItemRaw[] {
|
||||||
try {
|
try {
|
||||||
const content: string = data?.choices?.[0]?.message?.content ?? '';
|
const content: string = data?.choices?.[0]?.message?.content ?? '';
|
||||||
const cleaned = content.replace(/` + '```' + `json|` + '```' + `/g, '').trim();
|
const cleaned = cleanJsonPayload(content);
|
||||||
return JSON.parse(cleaned) as ParsedReceiptItemRaw[];
|
const parsed = JSON.parse(cleaned);
|
||||||
|
if (!Array.isArray(parsed)) {
|
||||||
|
throw new Error('Svar är inte en JSON-array');
|
||||||
|
}
|
||||||
|
|
||||||
|
const normalized = parsed
|
||||||
|
.map((item) => normalizeParsedItem(item))
|
||||||
|
.filter((item): item is ParsedReceiptItemRaw => item !== null);
|
||||||
|
|
||||||
|
if (normalized.length === 0 && parsed.length > 0) {
|
||||||
|
throw new Error('Alla AI-poster underkändes i validering');
|
||||||
|
}
|
||||||
|
|
||||||
|
return normalized;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.error(`Kunde inte parsa Mistral-svar (${source}): ${err}`);
|
this.logger.error(`Kunde inte parsa Mistral-svar (${source}): ${err}`);
|
||||||
throw new BadRequestException('AI-svaret kunde inte tolkas. Försök igen.');
|
throw new BadRequestException('AI-svaret kunde inte tolkas. Försök igen.');
|
||||||
@@ -175,14 +336,42 @@ export class ReceiptParsingService {
|
|||||||
|
|
||||||
private async callMistralWithRetry(body: object, apiKey: string, source: string): Promise<Response> {
|
private async callMistralWithRetry(body: object, apiKey: string, source: string): Promise<Response> {
|
||||||
for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
|
for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
|
||||||
const response = await fetch(MISTRAL_API_URL, {
|
const controller = new AbortController();
|
||||||
method: 'POST',
|
const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
|
||||||
headers: {
|
let response: Response;
|
||||||
'Content-Type': 'application/json',
|
|
||||||
Authorization: `Bearer ${apiKey}`,
|
try {
|
||||||
},
|
response = await fetch(MISTRAL_API_URL, {
|
||||||
body: JSON.stringify(body),
|
method: 'POST',
|
||||||
});
|
headers: {
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
Authorization: `Bearer ${apiKey}`,
|
||||||
|
},
|
||||||
|
body: JSON.stringify(body),
|
||||||
|
signal: controller.signal,
|
||||||
|
});
|
||||||
|
} catch (err: any) {
|
||||||
|
clearTimeout(timeoutId);
|
||||||
|
const isAbort = err?.name === 'AbortError';
|
||||||
|
this.logger.warn(
|
||||||
|
isAbort
|
||||||
|
? `Mistral timeout (${source}, försök ${attempt}/${MAX_RETRIES})`
|
||||||
|
: `Mistral anrop misslyckades (${source}, försök ${attempt}/${MAX_RETRIES}): ${err}`,
|
||||||
|
);
|
||||||
|
|
||||||
|
if (attempt < MAX_RETRIES) {
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, 1000 * attempt));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new ServiceUnavailableException(
|
||||||
|
isAbort
|
||||||
|
? 'Mistral API svarade inte i tid. Försök igen.'
|
||||||
|
: 'Mistral API är tillfälligt otillgänglig. Försök igen.',
|
||||||
|
);
|
||||||
|
} finally {
|
||||||
|
clearTimeout(timeoutId);
|
||||||
|
}
|
||||||
|
|
||||||
if (response.status === 503 || response.status === 429) {
|
if (response.status === 503 || response.status === 429) {
|
||||||
const err = await response.text();
|
const err = await response.text();
|
||||||
|
|||||||
Reference in New Issue
Block a user