feat: enhance product matching logic with improved scoring and tokenization
This commit is contained in:
@@ -12,6 +12,26 @@ import { CategoriesService } from '../categories/categories.service';
|
||||
const IMPORTER_SERVICE_URL =
|
||||
process.env.IMPORTER_SERVICE_URL || 'http://importer-api:3001';
|
||||
|
||||
/**
 * Swedish descriptor words (smoked, boiled, grilled, fried, sliced, shredded,
 * frozen, fresh) that describe how an item is prepared or stored rather than
 * what the item is.
 *
 * Words containing ö/ä are listed both with and without the diacritic so a
 * lookup succeeds regardless of which spelling the text uses.
 *
 * Typed as `ReadonlySet` so this shared module-level collection cannot be
 * mutated by accident; lookups via `.has()` are unaffected.
 */
const WEAK_DESCRIPTORS: ReadonlySet<string> = new Set([
  'rokt',
  'rökt',
  'kokt',
  'grillad',
  'stekt',
  'skivad',
  'strimlad',
  'fryst',
  'farsk',
  'färsk',
]);
|
||||
|
||||
/**
 * Splits free text into lowercase word tokens.
 *
 * The text is normalized to Unicode NFC first. Without that, a decomposed
 * letter such as "o" followed by a combining diaeresis (U+0308) is not
 * recognised by the split pattern, the word is cut at the combining mark,
 * and the fragments are usually dropped by the length filter.
 *
 * @param value Text to tokenize.
 * @returns Tokens of at least three characters, in order of appearance.
 *   Anything other than a-z, 0-9, å, ä and ö acts as a separator.
 */
function tokenize(value: string): string[] {
  return (
    value
      // Fold decomposed sequences into precomposed letters before matching.
      .normalize('NFC')
      .toLowerCase()
      // \u00e5 = å, \u00e4 = ä, \u00f6 = ö; written as escapes so the
      // pattern does not depend on how the source file itself is normalized.
      .split(/[^a-z0-9\u00e5\u00e4\u00f6]+/)
      .filter((w) => w.length >= 3)
  );
}
|
||||
|
||||
@Injectable()
|
||||
export class ReceiptImportService {
|
||||
private readonly logger = new Logger(ReceiptImportService.name);
|
||||
@@ -120,20 +140,71 @@ export class ReceiptImportService {
|
||||
products: { id: number; name: string; canonicalName: string | null }[],
|
||||
): { id: number; name: string; canonicalName: string | null } | undefined {
|
||||
// Dela upp kvittonamnet i ord (min 3 tecken)
|
||||
const rawWords = raw.split(/[\s\-_]+/).filter((w) => w.length >= 3);
|
||||
const rawWords = tokenize(raw);
|
||||
if (rawWords.length === 0) return undefined;
|
||||
|
||||
// Fortsätt med att hitta produkter där ett produktnamn-ord finns i kvittonamnet
|
||||
// Exempel: produktord "ost" finns i kvittoord "prästost", "herrgårdsost", "brieost"
|
||||
return products.find((p) => {
|
||||
const productWords = (p.canonicalName ?? p.name)
|
||||
.toLowerCase()
|
||||
.split(/[\s\-_]+/)
|
||||
.filter((w) => w.length >= 3);
|
||||
return productWords.some((pw) =>
|
||||
rawWords.some((rw) => rw.includes(pw) || pw.includes(rw)),
|
||||
);
|
||||
});
|
||||
const rawWordSet = new Set(rawWords);
|
||||
|
||||
let best:
|
||||
| { product: { id: number; name: string; canonicalName: string | null }; score: number }
|
||||
| undefined;
|
||||
|
||||
for (const product of products) {
|
||||
const productWords = tokenize(product.canonicalName ?? product.name);
|
||||
if (productWords.length === 0) continue;
|
||||
|
||||
let score = 0;
|
||||
let exactStrong = 0;
|
||||
let exactAny = 0;
|
||||
let partialStrong = 0;
|
||||
|
||||
const phrase = (product.canonicalName ?? product.name).toLowerCase();
|
||||
if (raw.includes(phrase)) {
|
||||
score += 5;
|
||||
}
|
||||
|
||||
for (const pw of productWords) {
|
||||
const isWeak = WEAK_DESCRIPTORS.has(pw);
|
||||
|
||||
if (rawWordSet.has(pw)) {
|
||||
exactAny += 1;
|
||||
if (isWeak) {
|
||||
score += 1;
|
||||
} else {
|
||||
exactStrong += 1;
|
||||
score += 8;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Delmatchning tillåts bara för ord med minst 4 tecken.
|
||||
if (pw.length < 4) continue;
|
||||
|
||||
const hasPartial = rawWords.some((rw) => rw.includes(pw) || pw.includes(rw));
|
||||
if (!hasPartial) continue;
|
||||
|
||||
if (isWeak) {
|
||||
// Deskriptiva ord (t.ex. rökt) ska inte driva förslag ensamma.
|
||||
continue;
|
||||
}
|
||||
|
||||
partialStrong += 1;
|
||||
score += 3;
|
||||
}
|
||||
|
||||
// Kräv antingen minst ett starkt exakt ord, eller flera samverkande signaler.
|
||||
const hasStrongSignal = exactStrong >= 1 || exactAny + partialStrong >= 2;
|
||||
if (!hasStrongSignal) continue;
|
||||
|
||||
// Tröskel för att undvika svaga enkelträffar.
|
||||
if (score < 8) continue;
|
||||
|
||||
if (!best || score > best.score) {
|
||||
best = { product, score };
|
||||
}
|
||||
}
|
||||
|
||||
return best?.product;
|
||||
}
|
||||
|
||||
private async enrichWithAiCategories(items: ParsedReceiptItem[]): Promise<ParsedReceiptItem[]> {
|
||||
|
||||
Reference in New Issue
Block a user