Files
microservice-importer/backend/src/receipt-parsing/receipt-parsing.service.ts
T
Nils-Johan Gynther fa9bd141e0
Test Suite / test (24.15.0) (push) Has been cancelled
fix: use require() for pdf-parse and pdfjs-dist legacy build to fix Node 24 compat
2026-05-03 22:10:53 +02:00

460 lines
15 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import {
BadRequestException,
Injectable,
Logger,
ServiceUnavailableException,
} from '@nestjs/common';
// eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires
const pdfParse = require('pdf-parse') as (buffer: Buffer) => Promise<{ text: string }>;
// Mistral chat-completions endpoint that every model request in this file is POSTed to.
const MISTRAL_API_URL = 'https://api.mistral.ai/v1/chat/completions';
const RECEIPT_VISION_MODEL = 'mistral-small-2603'; // vision — used for image input
const RECEIPT_TEXT_MODEL = 'mistral-small-latest'; // text — used as the AI fallback for PDF/OCR text
// Upper bound on attempts per Mistral request (the first call counts as attempt 1).
const MAX_RETRIES = 3;
// A single Mistral request is aborted after this many milliseconds.
const REQUEST_TIMEOUT_MS = 25_000;
/** Units an AI-returned item may carry after normalization; any other unit is rejected. */
const ALLOWED_UNITS = new Set<string>(
  ['st', 'kg', 'g', 'l', 'dl', 'cl', 'ml', 'förp', 'pak', 'burk', 'flaska'],
);
// Shared prompt fragment (Swedish) telling the model how to fill in "quantity" and "unit":
// loose-weight goods keep the weighed amount, packaged goods and multipacks are counted in
// "förp" (packages), and loose piece goods are counted in "st" (pieces).
// Interpolated into both the image prompt and the text-fallback prompt.
const QUANTITY_RULES = `
Regler för quantity och unit:
1. LÖSVIKT (chark, kött, ost, frukt/grönt vägt på kassabandet): quantity=faktisk vikt/volym från kvittot, unit=kg/g/l etc.
Exempel: "BLANDFÄRS 20%" köpt 0.997 kg -> quantity=0.997, unit="kg"
2. FÖRPACKAD VARA med storlek i namnet (mejeri, dryck, konserver, flingor): quantity=antal köpta förpackningar, unit="förp".
Exempel: "VISPGRÄDDE 5DL" köpt 1 -> quantity=1, unit="förp"
3. MULTIPACK (NxYg/NxYml i namnet): quantity=antal innerförpackningar (N), unit="förp".
Exempel: "BACON 3X120G" -> quantity=3, unit="förp"
4. FÖRPACKAT INNEHÅLL (bröd, kex, chips): quantity=antal köpta förpackningar, unit="förp".
5. LÖSA STYCKVAROR (enstaka frukt köpt lösvikt per styck): quantity=antal, unit="st".
`;
// Prompt (Swedish) sent together with the receipt image to the vision model. It asks for
// nothing but a JSON array of purchased items with the fields rawName, quantity, unit,
// price, brand and origin, without markdown formatting.
const IMAGE_PROMPT = `Du är en kvittoläsare. Analysera detta kvitto och returnera ENDAST en JSON-array med alla köpta varor.
Varje vara ska ha: "rawName", "quantity", "unit" (st/kg/g/l/dl/cl/ml/förp/pak/burk/flaska), "price" (SEK eller null), "brand" (eller null), "origin" (eller null).
${QUANTITY_RULES}
Returnera BARA JSON-arrayen utan markdown-formatering.`;
/**
 * Builds the Swedish prompt for the text model from receipt lines that the rule-based
 * parser could not interpret.
 *
 * @param text The unresolved receipt lines, newline-separated.
 * @returns The full prompt: instructions, the shared quantity rules, then the lines.
 */
const buildTextPrompt = (text: string): string =>
  [
    'Du är en kvittoläsare. Nedan följer rader från ett kvitto som regelbaserad parsning inte kunde tolka entydigt. Returnera ENDAST en JSON-array för dessa rader.',
    'Varje vara ska ha: "rawName" (exakt som angett), "quantity", "unit" (st/kg/g/l/dl/cl/ml/förp/pak/burk/flaska), "price" (SEK eller null), "brand" (eller null), "origin" (eller null).',
    QUANTITY_RULES,
    'Returnera BARA JSON-arrayen utan markdown-formatering.',
    'Rader att tolka:',
    text,
  ].join('\n');
/**
 * One purchased item extracted from a receipt, either by the rule-based line parser
 * or from the JSON array returned by the AI model.
 */
export interface ParsedReceiptItemRaw {
/** Item name text taken from the receipt line. */
rawName: string;
/** Number of packages/pieces, or the weighed amount for loose-weight goods. */
quantity: number;
/** Unit for `quantity`, e.g. "kg", "g", "st" or "förp" (package). */
unit: string;
/** Price in SEK when the model reported one; the rule-based parser always sets null. */
price?: number | null;
/** Brand reported by the model, or null. */
brand?: string | null;
/** Origin reported by the model, or null. */
origin?: string | null;
}
/**
 * Lower-case tokens that never count as part of a product name: currency words,
 * measurement units, and receipt bookkeeping words (totals, VAT, discount, payment).
 */
const NON_NAME_TOKENS = new Set<string>([
  // currency
  'kr', 'sek', 'ore', 'öre',
  // units
  'st', 'kg', 'g', 'mg', 'l', 'dl', 'cl', 'ml',
  // totals, VAT, discount and payment words
  'moms', 'summa', 'rabatt', 'kort', 'kontant', 'totalt', 'att', 'betala',
]);
/**
 * Heuristic: does the text contain at least one token that looks like part of a product name?
 *
 * The text is lower-cased and split on anything that is not a-z, 0-9 or å/ä/ö. Tokens shorter
 * than three characters are ignored, as are known non-name words, plain numbers and
 * number+unit tokens such as "98kr", "997kg" or "15st".
 *
 * @param value Receipt text to inspect.
 * @returns true when at least one remaining token contains a letter.
 */
function isLikelyNameLikeText(value: string): boolean {
  const plainNumber = /^\d+(?:[\.,]\d+)?$/;
  const numberWithUnit = /^\d+(?:[\.,]\d+)?(?:kr|sek|kg|g|mg|l|dl|cl|ml|st|fp|pkt|pak|förp)$/;

  for (const piece of value.toLowerCase().split(/[^a-z0-9åäö]+/)) {
    const token = piece.trim();
    if (token.length < 3) continue;
    if (NON_NAME_TOKENS.has(token)) continue;
    if (/^\d+$/.test(token) || plainNumber.test(token)) continue;
    // Skip receipt tokens such as "98kr", "997kg", "15st".
    if (numberWithUnit.test(token)) continue;
    if (/[a-zåäö]/i.test(token)) return true;
  }
  return false;
}
/**
 * Strips price-related noise from a receipt line and returns what is left if it still
 * looks like a product name.
 *
 * @param line One receipt line.
 * @returns The cleaned text, or null when nothing name-like remains.
 */
function extractNameCandidate(line: string): string | null {
  // Applied in order; every match is replaced by a single space.
  const noisePatterns: readonly RegExp[] = [
    /\b\d+(?:[\.,]\d+)?\s*(kr|sek)\s*\/\s*(kg|g|mg|l|dl|cl|ml)\b/gi, // "98,00 kr/kg"
    /\b(kr|sek)\s*\/\s*(kg|g|mg|l|dl|cl|ml)\b/gi, // bare "kr/kg"
    /\b(kr|sek)\b/gi, // stand-alone currency word
    /\b\d+(?:[\.,]\d{2})\b/g, // amounts with exactly two decimals
    /[|*]/g, // column separators / markers
    /\s+/g, // collapse whitespace runs
  ];

  let candidate = line;
  for (const pattern of noisePatterns) {
    candidate = candidate.replace(pattern, ' ');
  }
  candidate = candidate.trim();

  if (!isLikelyNameLikeText(candidate)) {
    return null;
  }
  return candidate;
}
/**
 * Splits extracted PDF text into cleaned receipt lines.
 *
 * Lines of two characters or fewer (after trimming) are dropped and whitespace runs are
 * collapsed. PDF receipts commonly put the item name on one line and the weight/price on
 * the next, so a line without name-like text is appended to the preceding line when that
 * line is name-like.
 *
 * @param text Raw text extracted from the PDF.
 * @returns The merged, whitespace-normalized lines in document order.
 */
function preprocessPdfLines(text: string): string[] {
  const merged: string[] = [];

  for (const rawLine of text.split('\n')) {
    const trimmed = rawLine.trim();
    if (trimmed.length <= 2) continue;

    const current = trimmed.replace(/\s+/g, ' ').trim();
    const lastIndex = merged.length - 1;
    const isContinuation =
      lastIndex >= 0 &&
      !isLikelyNameLikeText(current) &&
      isLikelyNameLikeText(merged[lastIndex]);

    if (isContinuation) {
      merged[lastIndex] = `${merged[lastIndex]} ${current}`;
    } else {
      merged.push(current);
    }
  }

  return merged;
}
/** Spellings that all mean "package" and are normalized to "förp". */
const PACKAGE_UNIT_ALIASES: ReadonlySet<string> = new Set([
  'forp',
  'förpackning',
  'forpackning',
  'paket',
  'pkt',
  'fp',
  'pack',
]);

/**
 * Normalizes a unit string: trims, lower-cases, removes every dot ("f.p." -> "fp"),
 * and maps known aliases to their canonical form ("förp" for packages, "st" for pieces).
 *
 * @param value Unit text as reported by the model, possibly missing.
 * @returns The normalized unit, or null when the input is missing or blank.
 */
function normalizeUnit(value: string | null | undefined): string | null {
  if (!value) return null;
  // Global regex: a string pattern would only strip the first dot.
  const unit = value.trim().toLowerCase().replace(/\./g, '').trim();
  if (!unit) return null;
  if (PACKAGE_UNIT_ALIASES.has(unit)) return 'förp';
  if (unit === 'styck') return 'st';
  return unit;
}
/**
 * Extracts the JSON array from a model reply.
 *
 * Markdown code fences (with or without a "json" tag) are replaced by spaces, then the
 * text from the first "[" to the last "]" is returned. When no such bracket pair exists
 * the fence-stripped, trimmed text is returned unchanged.
 *
 * @param content Raw message content from the model.
 * @returns The text most likely to be a JSON array.
 */
function cleanJsonPayload(content: string): string {
  const stripped = content
    .replace(/```\s*json/gi, ' ')
    .split('```')
    .join(' ')
    .trim();

  const start = stripped.indexOf('[');
  if (start === -1) {
    return stripped;
  }
  const end = stripped.lastIndexOf(']');
  if (end <= start) {
    return stripped;
  }
  return stripped.slice(start, end + 1).trim();
}
/**
 * Reports whether the text contains at least one letter a-z or å/ä/ö (case-insensitive).
 *
 * @param value Text to inspect.
 */
function hasAnyLetter(value: string): boolean {
  return value.search(/[a-zåäö]/i) !== -1;
}
/**
 * Reports whether a receipt line is bookkeeping rather than a purchased item.
 *
 * Discount, total, VAT, deposit and payment lines must never become item entries.
 * Matching is done on the trimmed, lower-cased text and only at the start of the line.
 *
 * @param value Receipt line or candidate item name.
 * @returns true when the line starts with one of the ignored keywords; false for blank input.
 */
function isIgnoredReceiptLine(value: string): boolean {
  const candidate = value.trim().toLowerCase();
  if (candidate.length === 0) {
    return false;
  }

  const nonItemPrefixes: readonly RegExp[] = [
    /^rabatt\b/,
    /^summa\b/,
    /^moms\b/,
    /^pant\b/,
    /^att\s+betala\b/,
    /^totalt\b/,
    /^kort\b/,
    /^kontant\b/,
    /^willys\s+plus\s*[:\-]?\b/,
  ];
  return nonItemPrefixes.some((pattern) => pattern.test(candidate));
}
/**
 * Converts a model-supplied numeric field to a number.
 *
 * Accepts real numbers and numeric strings, including the Swedish decimal comma ("0,997").
 * Everything else (blank strings, booleans, arrays, objects) yields NaN rather than being
 * coerced by `Number()`, which would turn "" into 0 and `true` into 1.
 */
function parseLooseNumber(value: unknown): number {
  if (typeof value === 'number') return value;
  if (typeof value !== 'string') return Number.NaN;
  const text = value.trim().replace(',', '.');
  return text === '' ? Number.NaN : Number(text);
}

/**
 * Validates and normalizes one element of the JSON array returned by the model.
 *
 * An element is rejected (null) when it is not an object, has no name containing a letter,
 * names a bookkeeping line (discount, total, payment, ...), has a non-positive or
 * non-numeric quantity, or has a unit outside the allowed set. An unusable price is kept
 * as null instead of rejecting the item.
 *
 * @param input One parsed JSON element of unknown shape.
 * @returns The normalized item, or null when validation fails.
 */
function normalizeParsedItem(input: unknown): ParsedReceiptItemRaw | null {
  if (!input || typeof input !== 'object') return null;
  const record = input as Record<string, unknown>;

  const rawName = typeof record.rawName === 'string' ? record.rawName.trim() : '';
  if (!rawName || !hasAnyLetter(rawName)) return null;
  if (isIgnoredReceiptLine(rawName)) return null;

  const quantity = parseLooseNumber(record.quantity);
  if (!Number.isFinite(quantity) || quantity <= 0) return null;

  const unit = normalizeUnit(typeof record.unit === 'string' ? record.unit : null);
  if (!unit || !ALLOWED_UNITS.has(unit)) return null;

  const price = record.price == null ? Number.NaN : parseLooseNumber(record.price);

  return {
    rawName,
    quantity,
    unit,
    price: Number.isFinite(price) ? price : null,
    brand: typeof record.brand === 'string' ? record.brand.trim() || null : null,
    origin: typeof record.origin === 'string' ? record.origin.trim() || null : null,
  };
}
/**
 * Rule-based parsing of a single receipt text line.
 *
 * Recognizes, in order:
 *  1. multipacks such as "3x120g" or "2 x 1.5l" -> quantity = pack count, unit "förp";
 *  2. a weight in kg/g -> quantity = that weight, unit kg/g;
 *  3. any other size (ml/cl/dl/l) -> one package, unit "förp".
 * Price, brand and origin are never filled in by this parser.
 *
 * NOTE(review): a package size in grams inside the name (e.g. "CHIPS 200G") is treated as
 * loose weight by rule 2 — confirm this is the intended trade-off.
 *
 * @param line One preprocessed receipt line.
 * @returns The parsed item, or null when the line is a bookkeeping line, carries no usable
 *          size, or cannot be interpreted (callers hand such lines to the AI fallback).
 */
function ruleBasedParseLine(line: string): ParsedReceiptItemRaw | null {
  const lowered = line.toLowerCase();
  // Fall back to the whole line when no cleaned name can be extracted.
  const rawName = extractNameCandidate(line) ?? line;
  if (isIgnoredReceiptLine(rawName)) {
    return null;
  }

  // Multipack: "3x120g", "2 x 1.5l"
  const multiPack = /(\d+)\s*[x×]\s*(\d+(?:[\.,]\d+)?)\s*(ml|cl|dl|l|g|kg)\b/i.exec(lowered);
  if (multiPack) {
    const count = Number.parseInt(multiPack[1], 10);
    return {
      rawName,
      quantity: Number.isFinite(count) && count > 0 ? count : 1,
      unit: 'förp',
      price: null,
      brand: null,
      origin: null,
    };
  }

  // Single size in the line: "5dl", "1,5l", "100g", "0,997 kg"
  const sized = /(\d+(?:[\.,]\d+)?)\s*(ml|cl|dl|l|g|kg)\b/i.exec(lowered);
  if (!sized) {
    // Cannot be interpreted by rules.
    return null;
  }

  const unit = sized[2].toLowerCase();
  if (unit === 'kg' || unit === 'g') {
    // Loose weight: report the actual weight. Multipacks were already handled above.
    const weight = Number.parseFloat(sized[1].replace(',', '.'));
    if (!Number.isFinite(weight) || weight <= 0) {
      // A zero weight is not a valid item; let the caller's fallback decide.
      return null;
    }
    return { rawName, quantity: weight, unit, price: null, brand: null, origin: null };
  }

  return { rawName, quantity: 1, unit: 'förp', price: null, brand: null, origin: null };
}
/**
 * Parses uploaded grocery receipts into raw line items.
 *
 * PDFs are converted to text and parsed line by line with rules; only lines the rules
 * cannot resolve are sent to the Mistral text model. Images are sent whole to the
 * Mistral vision model.
 */
@Injectable()
export class ReceiptParsingService {
  private readonly logger = new Logger(ReceiptParsingService.name);

  /**
   * Entry point: routes the upload to the PDF or the image flow.
   *
   * NOTE(review): every `application/octet-stream` upload is treated as a PDF regardless
   * of its file name — confirm that is intended.
   *
   * @param file The uploaded file (buffer, mimetype and original name are read).
   * @returns The parsed receipt items.
   * @throws ServiceUnavailableException when MISTRAL_API_KEY is not set or Mistral is unavailable.
   * @throws BadRequestException when the PDF or the AI reply cannot be read.
   */
  async parseReceipt(file: Express.Multer.File): Promise<ParsedReceiptItemRaw[]> {
    const apiKey = process.env.MISTRAL_API_KEY;
    if (!apiKey) {
      throw new ServiceUnavailableException('MISTRAL_API_KEY är inte konfigurerad');
    }
    const isPdf =
      file.mimetype === 'application/pdf' ||
      file.mimetype === 'application/octet-stream' ||
      file.originalname?.toLowerCase().endsWith('.pdf');
    if (isPdf) {
      return this.parseReceiptFromPdf(file.buffer, apiKey);
    }
    return this.parseReceiptFromImage(file.buffer, file.mimetype, apiKey);
  }

  /**
   * PDF flow: text extraction -> rule-based parsing -> AI fallback for unresolved lines.
   * Rule-resolved items come first in the result, followed by the AI-resolved ones.
   */
  private async parseReceiptFromPdf(buffer: Buffer, apiKey: string): Promise<ParsedReceiptItemRaw[]> {
    let text: string;
    try {
      // Try pdf-parse first
      const data = await pdfParse(buffer);
      text = data.text;
    } catch (err) {
      this.logger.warn(`pdf-parse misslyckades: ${err}`);
      // Fallback to pdfjs-dist legacy build (Node.js compatible, no DOMMatrix needed)
      try {
        // eslint-disable-next-line @typescript-eslint/no-require-imports
        const pdfjsLib = require('pdfjs-dist/legacy/build/pdf.js') as typeof import('pdfjs-dist');
        pdfjsLib.GlobalWorkerOptions.workerSrc = '';
        const loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(buffer) });
        const pdfDocument = await loadingTask.promise;
        let fullText = '';
        for (let i = 1; i <= pdfDocument.numPages; i++) {
          const page = await pdfDocument.getPage(i);
          const textContent = await page.getTextContent();
          // Keep the receipt's line structure: the downstream parsing is line-based, so a
          // text item flagged as end-of-line ends with a newline instead of a space.
          // Items without the flag are space-separated.
          const pageText = textContent.items
            .map((item: any) => `${item.str ? item.str : ''}${item.hasEOL ? '\n' : ' '}`)
            .join('');
          // Pages are separated by a line break so lines never merge across pages.
          fullText += `${pageText}\n`;
        }
        text = fullText;
      } catch (fallbackErr) {
        this.logger.error(`Både pdf-parse och pdfjs-dist misslyckades: ${fallbackErr}`);
        throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad eller krypterad.');
      }
    }

    const combinedLines = preprocessPdfLines(text);
    const resolved: ParsedReceiptItemRaw[] = [];
    const needsAI: string[] = [];
    for (const line of combinedLines) {
      if (isIgnoredReceiptLine(line)) {
        continue;
      }
      const item = ruleBasedParseLine(line);
      if (item !== null) {
        resolved.push(item);
      } else {
        needsAI.push(line);
      }
    }
    this.logger.log(`PDF: ${resolved.length} rader lösta regelbaserat, ${needsAI.length} skickas till AI`);
    if (needsAI.length > 0) {
      const aiItems = await this.callMistralText(needsAI, apiKey);
      resolved.push(...aiItems);
    }
    return resolved;
  }

  /** Image flow: the whole image is sent to the Mistral vision model as a base64 data URL. */
  private async parseReceiptFromImage(buffer: Buffer, mimetype: string, apiKey: string): Promise<ParsedReceiptItemRaw[]> {
    const base64 = buffer.toString('base64');
    const body = {
      model: RECEIPT_VISION_MODEL,
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: IMAGE_PROMPT },
            { type: 'image_url', image_url: { url: `data:${mimetype};base64,${base64}` } },
          ],
        },
      ],
    };
    const response = await this.callMistralWithRetry(body, apiKey, 'image');
    return this.parseJsonResponse(await this.readResponseJson(response, 'image'), 'image');
  }

  /** AI fallback for individual text lines (text model, cheaper than vision). */
  private async callMistralText(lines: string[], apiKey: string): Promise<ParsedReceiptItemRaw[]> {
    const body = {
      model: RECEIPT_TEXT_MODEL,
      messages: [{ role: 'user', content: buildTextPrompt(lines.join('\n')) }],
    };
    const response = await this.callMistralWithRetry(body, apiKey, 'text-fallback');
    return this.parseJsonResponse(await this.readResponseJson(response, 'text-fallback'), 'text-fallback');
  }

  /**
   * Reads the HTTP body as JSON. A body that cannot be read or is not valid JSON is
   * reported the same way as any other unreadable AI reply instead of escaping as an
   * unhandled error.
   *
   * @throws BadRequestException when the body cannot be read as JSON.
   */
  private async readResponseJson(response: Response, source: string): Promise<unknown> {
    try {
      return await response.json();
    } catch (err) {
      this.logger.error(`Kunde inte läsa Mistral-svar (${source}): ${err}`);
      throw new BadRequestException('AI-svaret kunde inte tolkas. Försök igen.');
    }
  }

  /**
   * Extracts and validates the item array from a chat-completions response body.
   *
   * @param data Decoded response body; the first choice's message content is used.
   * @param source Label used in log messages.
   * @returns The items that passed validation (empty when the model returned an empty array).
   * @throws BadRequestException when the content is not a JSON array or every item fails validation.
   */
  private parseJsonResponse(data: any, source: string): ParsedReceiptItemRaw[] {
    try {
      const content: string = data?.choices?.[0]?.message?.content ?? '';
      const cleaned = cleanJsonPayload(content);
      const parsed = JSON.parse(cleaned);
      if (!Array.isArray(parsed)) {
        throw new Error('Svar är inte en JSON-array');
      }
      const normalized = parsed
        .map((item) => normalizeParsedItem(item))
        .filter((item): item is ParsedReceiptItemRaw => item !== null);
      if (normalized.length === 0 && parsed.length > 0) {
        throw new Error('Alla AI-poster underkändes i validering');
      }
      return normalized;
    } catch (err) {
      this.logger.error(`Kunde inte parsa Mistral-svar (${source}): ${err}`);
      throw new BadRequestException('AI-svaret kunde inte tolkas. Försök igen.');
    }
  }

  /**
   * POSTs a chat-completions request, retrying on network errors, timeouts and
   * HTTP 429/503 with a linearly growing delay (1 s, 2 s, ...).
   *
   * @param body Request payload, serialized as JSON.
   * @param apiKey Bearer token for the Mistral API.
   * @param source Label used in log messages.
   * @returns The successful (2xx) response; the body has not been read yet.
   * @throws ServiceUnavailableException after the last failed attempt (network, timeout, 429/503).
   * @throws BadRequestException immediately on any other non-2xx status.
   */
  private async callMistralWithRetry(body: object, apiKey: string, source: string): Promise<Response> {
    for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
      let response: Response;
      try {
        response = await fetch(MISTRAL_API_URL, {
          method: 'POST',
          headers: {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${apiKey}`,
          },
          body: JSON.stringify(body),
          signal: controller.signal,
        });
      } catch (err: unknown) {
        // Clear right away: the finally block only runs after the back-off sleep below.
        clearTimeout(timeoutId);
        const isAbort = (err as { name?: unknown } | null | undefined)?.name === 'AbortError';
        this.logger.warn(
          isAbort
            ? `Mistral timeout (${source}, försök ${attempt}/${MAX_RETRIES})`
            : `Mistral anrop misslyckades (${source}, försök ${attempt}/${MAX_RETRIES}): ${err}`,
        );
        if (attempt < MAX_RETRIES) {
          await new Promise((resolve) => setTimeout(resolve, 1000 * attempt));
          continue;
        }
        throw new ServiceUnavailableException(
          isAbort
            ? 'Mistral API svarade inte i tid. Försök igen.'
            : 'Mistral API är tillfälligt otillgänglig. Försök igen.',
        );
      } finally {
        clearTimeout(timeoutId);
      }

      // Overload / rate limiting: retry with back-off.
      if (response.status === 503 || response.status === 429) {
        const err = await response.text();
        this.logger.warn(`Mistral ${response.status} (${source}, försök ${attempt}/${MAX_RETRIES}): ${err}`);
        if (attempt < MAX_RETRIES) {
          await new Promise((resolve) => setTimeout(resolve, 1000 * attempt));
          continue;
        }
        throw new ServiceUnavailableException('Mistral API är tillfälligt otillgänglig. Försök igen.');
      }

      // Any other error status is not retried.
      if (!response.ok) {
        const err = await response.text();
        this.logger.error(`Mistral ${response.status} (${source}): ${err}`);
        throw new BadRequestException(`Mistral API svarade med fel: ${response.status}`);
      }
      return response;
    }
    throw new ServiceUnavailableException('Mistral API misslyckades efter max antal försök');
  }
}