feat: enhance product matching logic with improved scoring and tokenization

This commit is contained in:
Nils-Johan Gynther
2026-05-02 00:05:09 +02:00
parent d3dac61765
commit d64f34f4ff
@@ -12,6 +12,26 @@ import { CategoriesService } from '../categories/categories.service';
/**
 * Base URL of the importer service.
 *
 * Read from the `IMPORTER_SERVICE_URL` environment variable; when that is
 * unset or empty, `http://importer-api:3001` is used instead (`||` is
 * intentional so that an empty string also falls back to the default).
 */
const IMPORTER_SERVICE_URL: string = process.env.IMPORTER_SERVICE_URL || 'http://importer-api:3001';
/**
 * Swedish preparation/state adjectives that describe how a product is
 * processed rather than what the product is.
 *
 * Two of the words are listed in both their accented and ASCII-folded
 * spellings.
 */
const WEAK_DESCRIPTORS: Set<string> = new Set<string>([
  'rokt', 'rökt', // smoked
  'kokt', // boiled / cooked
  'grillad', // grilled
  'stekt', // fried
  'skivad', // sliced
  'strimlad', // shredded
  'fryst', // frozen
  'farsk', 'färsk', // fresh
]);
/**
 * Splits free text into lower-case word tokens.
 *
 * The input is first normalized to Unicode NFC so that decomposed characters
 * (for example `o` followed by the combining diaeresis U+0308) are seen as the
 * single letters `å`/`ä`/`ö`. Without this step the combining mark would act
 * as a separator and cut a word like "rökt" into "ro" and "kt".
 *
 * @param value - Text to tokenize, e.g. a receipt line or a product name.
 * @returns The tokens in order of appearance. Each token is at least three
 *   characters long and consists only of `a-z`, `0-9`, `å`, `ä` and `ö`.
 */
function tokenize(value: string): string[] {
  return value
    .normalize('NFC')
    .toLowerCase()
    // Every run of characters outside a-z, 0-9, å, ä, ö is a separator.
    .split(/[^a-z0-9åäö]+/)
    // Drops the empty strings produced by leading/trailing separators as
    // well as fragments shorter than three characters.
    .filter((word) => word.length >= 3);
}
@Injectable()
export class ReceiptImportService {
private readonly logger = new Logger(ReceiptImportService.name);
@@ -120,20 +140,71 @@ export class ReceiptImportService {
products: { id: number; name: string; canonicalName: string | null }[],
): { id: number; name: string; canonicalName: string | null } | undefined {
// Dela upp kvittonamnet i ord (min 3 tecken)
const rawWords = raw.split(/[\s\-_]+/).filter((w) => w.length >= 3);
const rawWords = tokenize(raw);
if (rawWords.length === 0) return undefined;
// Fortsätt med att hitta produkter där ett produktnamn-ord finns i kvittonamnet
// Exempel: produktord "ost" finns i kvittoord "prästost", "herrgårdsost", "brieost"
return products.find((p) => {
const productWords = (p.canonicalName ?? p.name)
.toLowerCase()
.split(/[\s\-_]+/)
.filter((w) => w.length >= 3);
return productWords.some((pw) =>
rawWords.some((rw) => rw.includes(pw) || pw.includes(rw)),
);
});
const rawWordSet = new Set(rawWords);
let best:
| { product: { id: number; name: string; canonicalName: string | null }; score: number }
| undefined;
for (const product of products) {
const productWords = tokenize(product.canonicalName ?? product.name);
if (productWords.length === 0) continue;
let score = 0;
let exactStrong = 0;
let exactAny = 0;
let partialStrong = 0;
const phrase = (product.canonicalName ?? product.name).toLowerCase();
if (raw.includes(phrase)) {
score += 5;
}
for (const pw of productWords) {
const isWeak = WEAK_DESCRIPTORS.has(pw);
if (rawWordSet.has(pw)) {
exactAny += 1;
if (isWeak) {
score += 1;
} else {
exactStrong += 1;
score += 8;
}
continue;
}
// Delmatchning tillåts bara för ord med minst 4 tecken.
if (pw.length < 4) continue;
const hasPartial = rawWords.some((rw) => rw.includes(pw) || pw.includes(rw));
if (!hasPartial) continue;
if (isWeak) {
// Deskriptiva ord (t.ex. rökt) ska inte driva förslag ensamma.
continue;
}
partialStrong += 1;
score += 3;
}
// Kräv antingen minst ett starkt exakt ord, eller flera samverkande signaler.
const hasStrongSignal = exactStrong >= 1 || exactAny + partialStrong >= 2;
if (!hasStrongSignal) continue;
// Tröskel för att undvika svaga enkelträffar.
if (score < 8) continue;
if (!best || score > best.score) {
best = { product, score };
}
}
return best?.product;
}
private async enrichWithAiCategories(items: ParsedReceiptItem[]): Promise<ParsedReceiptItem[]> {