feat(flyer-parsing): add flyer parsing module
Test Suite / test (24.15.0) (push) Has been cancelled

- Added new FlyerParsingModule to the application
- Updated AppModule to import the new FlyerParsingModule
- Added new directory structure for flyer-parsing module
This commit is contained in:
Nils-Johan Gynther
2026-05-18 18:40:31 +02:00
parent cd830b9de8
commit 0b69683080
6 changed files with 285 additions and 10 deletions
+2
View File
@@ -3,6 +3,7 @@ import { WebScrapingModule } from './web-scraping-service/web-scraping.module';
import { RecipesModule } from './recipes/recipes.module'; import { RecipesModule } from './recipes/recipes.module';
import { DocumentServiceModule } from './document-service/document-service.module'; import { DocumentServiceModule } from './document-service/document-service.module';
import { ReceiptParsingModule } from './receipt-parsing/receipt-parsing.module'; import { ReceiptParsingModule } from './receipt-parsing/receipt-parsing.module';
import { FlyerParsingModule } from './flyer-parsing/flyer-parsing.module';
@Controller('health') @Controller('health')
class HealthController { class HealthController {
@@ -18,6 +19,7 @@ class HealthController {
WebScrapingModule, WebScrapingModule,
RecipesModule, RecipesModule,
ReceiptParsingModule, ReceiptParsingModule,
FlyerParsingModule,
], ],
controllers: [HealthController], controllers: [HealthController],
}) })
@@ -0,0 +1,13 @@
import { IsIn, IsOptional, IsString, MinLength } from 'class-validator';
/**
 * Request body accepted by the flyer parsing endpoint.
 *
 * Both fields are optional as far as validation is concerned; the validators
 * below only run when the corresponding value is actually supplied.
 */
export class ParseFlyerDto {
/** Raw flyer text. When supplied it must be a string of at least 20 characters. */
@IsOptional()
@IsString()
@MinLength(20)
text?: string;
/** Retailer whose flyer layout should be assumed. `'willys'` is the only accepted value. */
@IsOptional()
@IsString()
@IsIn(['willys'])
retailer?: 'willys';
}
@@ -0,0 +1,58 @@
import {
BadRequestException,
Body,
Controller,
HttpCode,
Post,
UploadedFile,
UseInterceptors,
} from '@nestjs/common';
import { FileInterceptor } from '@nestjs/platform-express';
import { memoryStorage } from 'multer';
import { ParseFlyerDto } from './dto/parse-flyer.dto';
import { FlyerParseResponse, FlyerParsingService } from './flyer-parsing.service';
/** MIME types an uploaded flyer file may declare; uploads with any other type are refused. */
const ALLOWED_UPLOAD_MIMES = new Set<string>(['application/pdf', 'application/octet-stream', 'text/plain']);
/**
 * HTTP entry point for flyer parsing, mounted under `/flyer`.
 */
@Controller('flyer')
export class FlyerParsingController {
  constructor(private readonly flyerParsingService: FlyerParsingService) {}

  /**
   * `POST /flyer/parse` — parses a flyer supplied either as a multipart upload in the
   * `file` field or as plain text in `body.text`.
   *
   * The upload is held in memory, limited to 15 MiB, and refused unless its MIME type
   * is listed in `ALLOWED_UPLOAD_MIMES`. Responds with 200 rather than Nest's default
   * 201 for POST handlers.
   *
   * @param body Validated request body (optional text and retailer).
   * @param file The uploaded file, if one was sent.
   * @returns The parse result produced by the service.
   * @throws BadRequestException when neither a file nor non-blank text is provided.
   */
  @Post('parse')
  @HttpCode(200)
  @UseInterceptors(
    FileInterceptor('file', {
      storage: memoryStorage(),
      limits: { fileSize: 15 * 1024 * 1024 },
      fileFilter: (_req, file, cb) => {
        const accepted = ALLOWED_UPLOAD_MIMES.has(file.mimetype);
        if (!accepted) {
          cb(
            new BadRequestException('Otillåten filtyp för flyer-parser. Stöd: PDF eller textfil.'),
            false,
          );
        } else {
          cb(null, true);
        }
      },
    }),
  )
  async parseFlyer(
    @Body() body: ParseFlyerDto,
    @UploadedFile() file?: Express.Multer.File,
  ): Promise<FlyerParseResponse> {
    // Trim once; a whitespace-only text counts as "no text".
    const trimmedText = body?.text?.trim();
    const hasInput = Boolean(file) || Boolean(trimmedText);
    if (!hasInput) {
      throw new BadRequestException('Skicka antingen fil under "file" eller text i body.text.');
    }

    const retailer = body?.retailer ?? 'willys';
    return this.flyerParsingService.parseFlyer({ file, text: trimmedText, retailer });
  }
}
@@ -0,0 +1,9 @@
import { Module } from '@nestjs/common';
import { FlyerParsingController } from './flyer-parsing.controller';
import { FlyerParsingService } from './flyer-parsing.service';
/** Nest module that registers the flyer parsing controller and its service. */
@Module({
controllers: [FlyerParsingController],
providers: [FlyerParsingService],
})
export class FlyerParsingModule {}
@@ -0,0 +1,193 @@
import { BadRequestException, Injectable, Logger } from '@nestjs/common';
// eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires
const pdfParse = require('pdf-parse') as (buffer: Buffer) => Promise<{ text: string }>;
import { normalizeName } from '../common/utils/normalize-name';
/**
 * Matches a line that begins with one of the known category words, case-insensitively.
 * Each word is listed both with and without Swedish diacritics (e.g. `Kott`/`Kött`).
 * Capture group 1 is the category word. The pattern is anchored at the start only,
 * so any text may follow the word.
 */
const CATEGORY_REGEX = /^(Fisk|Kott|Kött|Mejeri|Gronsaker|Grönsaker|Frukt|Dryck|Brod|Bröd|Pasta|Ris)\b/i;
/** One product offer extracted from a single flyer line. */
export type FlyerParseItem = {
/** Product name as found in the line, with price and offer fragments stripped. */
rawName: string;
/** `rawName` passed through `normalizeName`. */
normalizedName: string;
/** Most recent category header seen before this line, or `null` if none was seen. */
category: string | null;
/** Price amount parsed from an "<amount> kr" fragment, or `null` when none was found. */
price: number | null;
/** Lower-cased unit following "kr/" in the price fragment, or `null`. */
priceUnit: string | null;
/** Amount parsed from a "jämförpris <amount> kr/<unit>" fragment, or `null`. */
comparisonPrice: number | null;
/** Lower-cased unit of the comparison price, or `null`. */
comparisonUnit: string | null;
/** Matched purchase-limit or "lägsta 30-dgrspris" text, or `null`. */
offerText: string | null;
/** Heuristic score: 0.45 plus 0.15 per reason code, capped at 0.99. */
confidence: number;
/**
 * Signals that were detected on the line: `price_found`, `comparison_price_found`,
 * `offer_found`, `bullet_structured_line`.
 */
reasonCodes: string[];
};
/** Result returned by the flyer parser. */
export type FlyerParseResponse = {
/** Retailer the flyer was parsed as. */
retailer: 'willys';
/** Version tag of the parsing rules that produced this result. */
parserVersion: 'v1';
/** Extracted product offers, de-duplicated. */
items: FlyerParseItem[];
/** Non-fatal notes, e.g. that no products could be interpreted. */
warnings: string[];
};
/**
 * Extracts structured product offers from a retailer flyer supplied either as
 * pasted text or as an uploaded PDF.
 *
 * Parsing is line-based and heuristic: category header lines set the current
 * category, boilerplate lines are skipped, and every remaining line is scanned
 * for a price, a comparison price ("jämförpris") and offer text.
 */
@Injectable()
export class FlyerParsingService {
  private readonly logger = new Logger(FlyerParsingService.name);

  /** "jämförpris <amount> kr/<unit>" — group 1 is the amount, group 2 the unit. */
  private static readonly comparisonPricePattern =
    /j[aä]mf[oö]rpris\s*(\d{1,4}(?:[\.,:]\d{1,2})?)\s*kr\s*\/\s*([a-zåäö]+)/i;

  /**
   * "<amount> kr" with an optional "/<unit>" — group 1 is the amount, group 2 the unit.
   * The lookahead after "kr" keeps words that merely start with those letters
   * ("2 krossade tomater") from being read as a price; the spelled-out "kronor"
   * is still accepted.
   */
  private static readonly pricePattern =
    /(?:^|\s)(\d{1,4}(?:[\.,:]\d{1,2})?)\s*kr(?:onor)?(?![a-zåäö])(?:\s*\/\s*([a-zåäö]+))?/i;

  /** Purchase-limit text ("max N köp/hushåll") or "lägsta 30-dgrspris N kr". */
  private static readonly offerPattern =
    /(max\s*\d+\s*(?:k[öo]p|f[öo]rp)\/?hush[åa]ll|l[aä]gsta\s*30-dgrspris\s*\d+[\.,:]?\d*\s*kr)/i;

  /**
   * Parses a flyer into product offers.
   *
   * Non-blank `args.text` takes precedence; the file is only read when no text is given.
   *
   * @param args.file Uploaded flyer file (PDF), if any.
   * @param args.text Raw flyer text, if any.
   * @param args.retailer Retailer to report in the response; defaults to `'willys'`.
   * @returns The extracted items plus any non-fatal warnings.
   * @throws BadRequestException when no text is available from either source, or when
   *   the file is not a PDF or cannot be read.
   */
  async parseFlyer(args: {
    file?: Express.Multer.File;
    text?: string;
    retailer?: 'willys';
  }): Promise<FlyerParseResponse> {
    const retailer = args.retailer ?? 'willys';
    const warnings: string[] = [];
    // `||` on purpose: a blank/whitespace-only text must fall through to the file.
    const text =
      args.text?.trim() || (args.file ? await this.extractTextFromFile(args.file, warnings) : '');
    if (!text) {
      throw new BadRequestException('Ingen text kunde extraheras från underlaget.');
    }

    const items = this.parseWillysText(text);
    if (items.length === 0) {
      warnings.push('Inga produkter kunde tolkas. Kontrollera PDF-kvalitet eller textformat.');
    }

    return {
      retailer,
      parserVersion: 'v1',
      items,
      warnings,
    };
  }

  /**
   * Reads the text layer of an uploaded PDF.
   *
   * A file counts as a PDF when its MIME type is `application/pdf` or
   * `application/octet-stream`, or when its original name ends in `.pdf`.
   *
   * NOTE(review): the controller's upload filter also admits `text/plain`; such a
   * file is rejected here unless its name ends in `.pdf` — confirm this is intended.
   *
   * @param file The uploaded file; its in-memory buffer is handed to pdf-parse.
   * @param warnings Collector that receives a note when the PDF has no text.
   * @returns The trimmed text, or `''` when the PDF contains none.
   * @throws BadRequestException when the file is not a PDF or pdf-parse fails.
   */
  private async extractTextFromFile(file: Express.Multer.File, warnings: string[]): Promise<string> {
    const isPdf =
      file.mimetype === 'application/pdf' ||
      file.mimetype === 'application/octet-stream' ||
      file.originalname?.toLowerCase().endsWith('.pdf');
    if (!isPdf) {
      throw new BadRequestException('Endast PDF stöds i detta steg för flyer-parsning.');
    }

    try {
      const parsed = await pdfParse(file.buffer);
      const text = parsed.text?.trim() ?? '';
      if (text) return text;
      warnings.push('PDF lästes men textinnehållet var tomt.');
      return '';
    } catch (err) {
      // The underlying error is logged; the client only gets a generic 400.
      this.logger.warn(`pdf-parse misslyckades för flyer: ${String(err)}`);
      throw new BadRequestException('Kunde inte läsa PDF-underlaget.');
    }
  }

  /**
   * Walks the flyer text line by line and collects the product offers found.
   *
   * @param text Full flyer text.
   * @returns De-duplicated items in order of first appearance.
   */
  private parseWillysText(text: string): FlyerParseItem[] {
    const lines = text
      .split('\n')
      .map((line) => line.replace(/\s+/g, ' ').trim())
      .filter((line) => line.length > 1);

    const items: FlyerParseItem[] = [];
    let currentCategory: string | null = null;

    for (const line of lines) {
      const categoryWord = line.match(CATEGORY_REGEX)?.[1];
      // A line that starts with a category word is a header only when it carries no
      // price information; otherwise it is a product whose name happens to start
      // with that word (e.g. "Pasta Barilla 12,90 kr") and must not be swallowed.
      if (categoryWord && !this.hasPriceSignal(line)) {
        currentCategory = this.normalizeCategory(categoryWord);
        continue;
      }

      if (this.isIgnoredLine(line)) {
        continue;
      }

      const parsed = this.parseProductLine(line, currentCategory);
      if (parsed) {
        items.push(parsed);
      }
    }

    return this.deduplicate(items);
  }

  /**
   * Interprets one line as a product offer.
   *
   * @param line A whitespace-normalized flyer line.
   * @param category Category to attach to the item, or `null`.
   * @returns The item, or `null` when the line has no usable name or none of the
   *   recognised signals (price, comparison price, offer text, bullet structure).
   */
  private parseProductLine(line: string, category: string | null): FlyerParseItem | null {
    const segments = this.splitSegments(line);
    const source = segments.length > 0 ? segments.join(' ') : line;

    const compareMatch = source.match(FlyerParsingService.comparisonPricePattern);
    const offerMatch = source.match(FlyerParsingService.offerPattern);
    // Look for the selling price only outside the comparison-price and offer
    // fragments; otherwise "jämförpris 199 kr/kg" or "lägsta 30-dgrspris 59 kr"
    // would be reported as the price whenever it precedes (or replaces) the real one.
    const priceMatch = this.maskMatches(source, compareMatch, offerMatch).match(
      FlyerParsingService.pricePattern,
    );

    // The name is taken from the first bullet-separated segment only.
    const nameCandidate = this.extractNameCandidate(segments[0] ?? line);
    if (!nameCandidate) {
      return null;
    }

    const reasonCodes: string[] = [];
    if (priceMatch) reasonCodes.push('price_found');
    if (compareMatch) reasonCodes.push('comparison_price_found');
    if (offerMatch) reasonCodes.push('offer_found');
    if (segments.length > 1) reasonCodes.push('bullet_structured_line');
    if (reasonCodes.length === 0) {
      return null;
    }

    // 0.45 base plus 0.15 per detected signal, never reported as certain.
    const confidence = Math.min(0.99, 0.45 + reasonCodes.length * 0.15);
    const priceText = priceMatch?.[1];
    const compareText = compareMatch?.[1];

    return {
      rawName: nameCandidate,
      normalizedName: normalizeName(nameCandidate),
      category,
      price: priceText !== undefined ? this.parseSvNumber(priceText) : null,
      priceUnit: priceMatch?.[2]?.toLowerCase() ?? null,
      comparisonPrice: compareText !== undefined ? this.parseSvNumber(compareText) : null,
      comparisonUnit: compareMatch?.[2]?.toLowerCase() ?? null,
      offerText: offerMatch?.[1] ?? null,
      confidence,
      reasonCodes,
    };
  }

  /** Splits a line on the bullet/pipe separators and drops empty segments. */
  private splitSegments(line: string): string[] {
    return line
      .split(/[•|]/g)
      .map((segment) => segment.trim())
      .filter(Boolean);
  }

  /** Returns `source` with the full text of each given match replaced by a space. */
  private maskMatches(source: string, ...matches: Array<RegExpMatchArray | null>): string {
    let masked = source;
    for (const match of matches) {
      const whole = match?.[0];
      if (whole) {
        // Function replacer so the replacement is never interpreted as a `$` pattern.
        masked = masked.replace(whole, () => ' ');
      }
    }
    return masked;
  }

  /** Tells whether a line contains a price, comparison-price or offer fragment. */
  private hasPriceSignal(line: string): boolean {
    const segments = this.splitSegments(line);
    const source = segments.length > 0 ? segments.join(' ') : line;
    return (
      FlyerParsingService.comparisonPricePattern.test(source) ||
      FlyerParsingService.offerPattern.test(source) ||
      FlyerParsingService.pricePattern.test(source)
    );
  }

  /**
   * Strips comparison-price, price and offer fragments from a segment and returns
   * what is left as the product name.
   *
   * @returns The cleaned name, or `null` when fewer than two characters remain or
   *   the remainder contains no letter.
   */
  private extractNameCandidate(raw: string): string | null {
    const cleaned = raw
      .replace(/j[aä]mf[oö]rpris.*$/i, ' ')
      .replace(/\b\d{1,4}(?:[\.,:]\d{1,2})?\s*kr(?:\s*\/\s*[a-zåäö]+)?\b/gi, ' ')
      .replace(/\b(max\s*\d+\s*(?:k[öo]p|f[öo]rp)\/?hush[åa]ll|l[aä]gsta\s*30-dgrspris.*)$/i, ' ')
      .replace(/\s+/g, ' ')
      .trim();

    if (cleaned.length < 2) return null;
    if (!/[a-zåäö]/i.test(cleaned)) return null;
    return cleaned;
  }

  /**
   * Maps a matched category word to its canonical spelling: lower-cased, with
   * Swedish diacritics restored, then capitalised (e.g. `KOTT` becomes `Kött`).
   */
  private normalizeCategory(value: string): string {
    const normalized = value
      .toLowerCase()
      .replace('kott', 'kött')
      .replace('gronsaker', 'grönsaker')
      .replace('brod', 'bröd');
    return normalized.charAt(0).toUpperCase() + normalized.slice(1);
  }

  /** Parses an amount that uses `,`, `:` or `.` as decimal separator ("29,90", "29:90"). */
  private parseSvNumber(value: string): number {
    return Number.parseFloat(value.replace(':', '.').replace(',', '.'));
  }

  /**
   * Tells whether a line is boilerplate that should not be parsed as a product:
   * shorter than three characters, starting with one of a few known header words,
   * or starting with a `N / M` number pair.
   */
  private isIgnoredLine(line: string): boolean {
    const normalized = line.trim().toLowerCase();
    if (normalized.length < 3) return true;
    if (/^(willys|vecka|erbjudande|g[aä]ller|reservation)/i.test(normalized)) return true;
    if (/^\d+\s*\/\s*\d+/.test(normalized)) return true;
    return false;
  }

  /**
   * Removes repeated items, keeping the first occurrence. Two items are the same
   * when normalized name, price, comparison price and category all agree.
   */
  private deduplicate(items: FlyerParseItem[]): FlyerParseItem[] {
    const seen = new Set<string>();
    const unique: FlyerParseItem[] = [];
    for (const item of items) {
      const key = `${item.normalizedName}|${item.price ?? ''}|${item.comparisonPrice ?? ''}|${item.category ?? ''}`;
      if (seen.has(key)) continue;
      seen.add(key);
      unique.push(item);
    }
    return unique;
  }
}
File diff suppressed because one or more lines are too long