From d64f34f4ff10147daefd1b916afa328b1375277f Mon Sep 17 00:00:00 2001
From: Nils-Johan Gynther
Date: Sat, 2 May 2026 00:05:09 +0200
Subject: [PATCH] feat: enhance product matching logic with improved scoring and tokenization

---
 .../receipt-import/receipt-import.service.ts | 111 +++++++++++++++++--
 1 file changed, 99 insertions(+), 12 deletions(-)

diff --git a/backend/src/receipt-import/receipt-import.service.ts b/backend/src/receipt-import/receipt-import.service.ts
index 305c0234..fb8ba085 100644
--- a/backend/src/receipt-import/receipt-import.service.ts
+++ b/backend/src/receipt-import/receipt-import.service.ts
@@ -12,6 +12,38 @@ import { CategoriesService } from '../categories/categories.service';
 const IMPORTER_SERVICE_URL =
   process.env.IMPORTER_SERVICE_URL || 'http://importer-api:3001';
 
+// Descriptive preparation/state words (smoked, boiled, grilled, ...), listed
+// with and without diacritics. They are too generic to identify a product, so
+// the matcher gives them little weight and never lets them match partially.
+const WEAK_DESCRIPTORS = new Set([
+  'rokt',
+  'rökt',
+  'kokt',
+  'grillad',
+  'stekt',
+  'skivad',
+  'strimlad',
+  'fryst',
+  'farsk',
+  'färsk',
+]);
+
+/**
+ * Splits a name into lowercase word tokens for product matching.
+ *
+ * Any run of characters that is neither a letter nor a digit separates
+ * tokens, and tokens shorter than three characters are dropped. Letters are
+ * matched with Unicode property escapes so that names such as "müsli" or
+ * "crème" stay whole instead of being cut at the accented character.
+ */
+function tokenize(value: string): string[] {
+  return value
+    .normalize('NFC')
+    .toLowerCase()
+    .split(/[^\p{L}\p{N}]+/u)
+    .filter((w) => w.length >= 3);
+}
+
 @Injectable()
 export class ReceiptImportService {
   private readonly logger = new Logger(ReceiptImportService.name);
@@ -120,20 +152,75 @@ export class ReceiptImportService {
     products: { id: number; name: string; canonicalName: string | null }[],
   ): { id: number; name: string; canonicalName: string | null } | undefined {
     // Dela upp kvittonamnet i ord (min 3 tecken)
-    const rawWords = raw.split(/[\s\-_]+/).filter((w) => w.length >= 3);
+    const rawWords = tokenize(raw);
     if (rawWords.length === 0) return undefined;
 
-    // Fortsätt med att hitta produkter där ett produktnamn-ord finns i kvittonamnet
-    // Exempel: produktord "ost" finns i kvittoord "prästost", "herrgårdsost", "brieost"
-    return products.find((p) => {
-      const productWords = (p.canonicalName ?? p.name)
-        .toLowerCase()
-        .split(/[\s\-_]+/)
-        .filter((w) => w.length >= 3);
-      return productWords.some((pw) =>
-        rawWords.some((rw) => rw.includes(pw) || pw.includes(rw)),
-      );
-    });
+    // Lowercased once; the product phrases compared against it are lowercase too.
+    const rawLower = raw.toLowerCase();
+    const rawWordSet = new Set(rawWords);
+
+    let best:
+      | { product: { id: number; name: string; canonicalName: string | null }; score: number }
+      | undefined;
+
+    for (const product of products) {
+      const productWords = tokenize(product.canonicalName ?? product.name);
+      if (productWords.length === 0) continue;
+
+      let score = 0;
+      let exactStrong = 0;
+      let exactAny = 0;
+      let partialStrong = 0;
+
+      // Bonus when the full product name occurs verbatim in the receipt text.
+      const phrase = (product.canonicalName ?? product.name).toLowerCase();
+      if (rawLower.includes(phrase)) {
+        score += 5;
+      }
+
+      for (const pw of productWords) {
+        const isWeak = WEAK_DESCRIPTORS.has(pw);
+
+        if (rawWordSet.has(pw)) {
+          exactAny += 1;
+          if (isWeak) {
+            score += 1;
+          } else {
+            exactStrong += 1;
+            score += 8;
+          }
+          continue;
+        }
+
+        // Partial (substring) matching is only allowed for words of at least 4 characters.
+        if (pw.length < 4) continue;
+
+        const hasPartial = rawWords.some((rw) => rw.includes(pw) || pw.includes(rw));
+        if (!hasPartial) continue;
+
+        if (isWeak) {
+          // Descriptive words (e.g. "rökt") must not drive a suggestion on their own.
+          continue;
+        }
+
+        partialStrong += 1;
+        score += 3;
+      }
+
+      // Require at least one strong exact word, or several signals that agree.
+      const hasStrongSignal = exactStrong >= 1 || exactAny + partialStrong >= 2;
+      if (!hasStrongSignal) continue;
+
+      // Threshold that filters out weak single hits.
+      if (score < 8) continue;
+
+      // Strict ">" keeps the earliest product when scores tie.
+      if (!best || score > best.score) {
+        best = { product, score };
+      }
+    }
+
+    return best?.product;
   }
 
   private async enrichWithAiCategories(items: ParsedReceiptItem[]): Promise {