- Added new FlyerParsingModule to the application
- Updated AppModule to import the new FlyerParsingModule
- Added new directory structure for the flyer-parsing module
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
import { Module, Controller, Get } from '@nestjs/common';
|
||||
import { WebScrapingModule } from './web-scraping-service/web-scraping.module';
|
||||
import { RecipesModule } from './recipes/recipes.module';
|
||||
import { DocumentServiceModule } from './document-service/document-service.module';
|
||||
import { ReceiptParsingModule } from './receipt-parsing/receipt-parsing.module';
|
||||
import { Module, Controller, Get } from '@nestjs/common';
|
||||
import { WebScrapingModule } from './web-scraping-service/web-scraping.module';
|
||||
import { RecipesModule } from './recipes/recipes.module';
|
||||
import { DocumentServiceModule } from './document-service/document-service.module';
|
||||
import { ReceiptParsingModule } from './receipt-parsing/receipt-parsing.module';
|
||||
import { FlyerParsingModule } from './flyer-parsing/flyer-parsing.module';
|
||||
|
||||
@Controller('health')
|
||||
class HealthController {
|
||||
@@ -15,10 +16,11 @@ class HealthController {
|
||||
/**
 * Root application module.
 *
 * Wires together the feature modules (document service, web scraping,
 * recipes, receipt parsing and flyer parsing) and exposes the health
 * controller declared earlier in this file.
 */
@Module({
  imports: [
    DocumentServiceModule,
    WebScrapingModule,
    RecipesModule,
    ReceiptParsingModule,
    // Added by this commit: registers the flyer-parsing feature module.
    FlyerParsingModule,
  ],
  controllers: [HealthController],
})
export class AppModule {}
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
import { IsIn, IsOptional, IsString, MinLength } from 'class-validator';
|
||||
|
||||
/**
 * Request body for the flyer-parse endpoint.
 *
 * Both fields are optional at the DTO level; the controller separately
 * requires that either an uploaded file or a non-blank `text` is present.
 */
export class ParseFlyerDto {
  /** Raw flyer text. When provided it must be a string of at least 20 characters. */
  @IsOptional()
  @IsString()
  @MinLength(20)
  text?: string;

  /** Retailer whose flyer format should be used. Only `'willys'` is accepted. */
  @IsOptional()
  @IsString()
  @IsIn(['willys'])
  retailer?: 'willys';
}
|
||||
@@ -0,0 +1,58 @@
|
||||
import {
|
||||
BadRequestException,
|
||||
Body,
|
||||
Controller,
|
||||
HttpCode,
|
||||
Post,
|
||||
UploadedFile,
|
||||
UseInterceptors,
|
||||
} from '@nestjs/common';
|
||||
import { FileInterceptor } from '@nestjs/platform-express';
|
||||
import { memoryStorage } from 'multer';
|
||||
import { ParseFlyerDto } from './dto/parse-flyer.dto';
|
||||
import { FlyerParseResponse, FlyerParsingService } from './flyer-parsing.service';
|
||||
|
||||
/**
 * MIME types accepted by the upload file filter of the flyer-parse endpoint.
 * Declared read-only so the allow-list cannot be mutated at runtime.
 */
const ALLOWED_UPLOAD_MIMES: ReadonlySet<string> = new Set([
  'application/pdf',
  'application/octet-stream',
  'text/plain',
]);
|
||||
|
||||
/**
 * HTTP entry point for flyer parsing, mounted under `/flyer`.
 */
@Controller('flyer')
export class FlyerParsingController {
  constructor(private readonly flyerParsingService: FlyerParsingService) {}

  /**
   * `POST /flyer/parse` — parses a flyer supplied either as an uploaded file
   * (multipart field `file`) or as raw text in `body.text`.
   *
   * The upload is buffered in memory, limited to 15 MiB, and rejected by the
   * file filter unless its MIME type is in `ALLOWED_UPLOAD_MIMES`.
   *
   * @param body Validated request body; `retailer` defaults to `'willys'`.
   * @param file Optional uploaded file.
   * @returns The parse result produced by `FlyerParsingService.parseFlyer`.
   * @throws BadRequestException when neither a file nor non-blank text is sent,
   *         or when the uploaded file has a disallowed MIME type.
   */
  @Post('parse')
  @HttpCode(200) // respond 200 instead of the POST default 201
  @UseInterceptors(
    FileInterceptor('file', {
      storage: memoryStorage(),
      limits: { fileSize: 15 * 1024 * 1024 },
      fileFilter: (_req, file, cb) => {
        if (ALLOWED_UPLOAD_MIMES.has(file.mimetype)) {
          cb(null, true);
          return;
        }
        cb(
          new BadRequestException('Otillåten filtyp för flyer-parser. Stöd: PDF eller textfil.'),
          false,
        );
      },
    }),
  )
  async parseFlyer(
    @Body() body: ParseFlyerDto,
    @UploadedFile() file?: Express.Multer.File,
  ): Promise<FlyerParseResponse> {
    // Trim once and reuse for both the presence check and the service call.
    const text = body?.text?.trim();

    if (!file && !text) {
      throw new BadRequestException('Skicka antingen fil under "file" eller text i body.text.');
    }

    return this.flyerParsingService.parseFlyer({
      file,
      text,
      retailer: body?.retailer ?? 'willys',
    });
  }
}
|
||||
@@ -0,0 +1,9 @@
|
||||
import { Module } from '@nestjs/common';
|
||||
import { FlyerParsingController } from './flyer-parsing.controller';
|
||||
import { FlyerParsingService } from './flyer-parsing.service';
|
||||
|
||||
/**
 * Feature module for flyer parsing: registers the flyer-parsing controller
 * and provides the flyer-parsing service.
 */
@Module({
  controllers: [FlyerParsingController],
  providers: [FlyerParsingService],
})
export class FlyerParsingModule {}
|
||||
@@ -0,0 +1,193 @@
|
||||
import { BadRequestException, Injectable, Logger } from '@nestjs/common';
|
||||
// eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires
|
||||
const pdfParse = require('pdf-parse') as (buffer: Buffer) => Promise<{ text: string }>;
|
||||
import { normalizeName } from '../common/utils/normalize-name';
|
||||
|
||||
/**
 * Matches a line that starts with a known category word (with or without
 * Swedish diacritics); capture group 1 is the category word as written.
 *
 * A Unicode-aware negative lookahead is used instead of `\b`, because `\b`
 * only knows ASCII word characters and would treat "Fiskägg" as the category
 * "Fisk" followed by a non-word character.
 */
const CATEGORY_REGEX =
  /^(Fisk|Kott|Kött|Mejeri|Gronsaker|Grönsaker|Frukt|Dryck|Brod|Bröd|Pasta|Ris)(?![\p{L}\p{N}_])/iu;
|
||||
|
||||
/** One product entry extracted from a flyer line. */
export type FlyerParseItem = {
  /** Product name as found in the line, with price/offer fragments stripped. */
  rawName: string;
  /** `rawName` passed through the shared `normalizeName` helper. */
  normalizedName: string;
  /** Most recent category header seen above the line, or `null` if none. */
  category: string | null;
  /** Price in kr parsed from the line, or `null` when no price was found. */
  price: number | null;
  /** Lower-cased unit following `kr/` in the price (e.g. `kg`), or `null`. */
  priceUnit: string | null;
  /** Amount from a "jämförpris … kr/unit" fragment, or `null`. */
  comparisonPrice: number | null;
  /** Lower-cased unit of the comparison price, or `null`. */
  comparisonUnit: string | null;
  /** Matched offer fragment (purchase limit or lowest-30-day-price note), or `null`. */
  offerText: string | null;
  /** Heuristic score that grows with the number of reason codes; at most 0.99. */
  confidence: number;
  /**
   * Signals that contributed to the item: `price_found`,
   * `comparison_price_found`, `offer_found`, `bullet_structured_line`.
   */
  reasonCodes: string[];
};
|
||||
|
||||
/** Result returned by the flyer parser. */
export type FlyerParseResponse = {
  /** Retailer the flyer was parsed for; only `'willys'` exists so far. */
  retailer: 'willys';
  /** Version tag of the parsing rules that produced this result. */
  parserVersion: 'v1';
  /** De-duplicated product entries extracted from the flyer. */
  items: FlyerParseItem[];
  /** Non-fatal, human-readable notes collected while parsing. */
  warnings: string[];
};
|
||||
|
||||
/**
 * Turns a retailer flyer (currently only the Willys text layout) into a list
 * of structured product items.
 *
 * Input is either raw text or an uploaded file. PDF files are run through
 * `pdf-parse`; plain-text files are decoded as UTF-8. The text is then parsed
 * line by line with regular-expression heuristics.
 */
@Injectable()
export class FlyerParsingService {
  /** "jämförpris 99,90 kr/kg" — group 1 is the amount, group 2 the unit. */
  private static readonly COMPARISON_PRICE_REGEX =
    /j[aä]mf[oö]rpris\s*(\d{1,4}(?:[\.,:]\d{1,2})?)\s*kr\s*\/\s*([a-zåäö]+)/i;

  /** "49,90 kr" or "49 kr/st" — group 1 is the amount, group 2 the optional unit. */
  private static readonly PRICE_REGEX =
    /(?:^|\s)(\d{1,4}(?:[\.,:]\d{1,2})?)\s*kr(?:\s*\/\s*([a-zåäö]+))?/i;

  /** Purchase-limit note ("max 2 köp/hushåll") or lowest-30-day-price note. */
  private static readonly OFFER_REGEX =
    /(max\s*\d+\s*(?:k[öo]p|f[öo]rp)\/?hush[åa]ll|l[aä]gsta\s*30-dgrspris\s*\d+[\.,:]?\d*\s*kr)/i;

  private readonly logger = new Logger(FlyerParsingService.name);

  /**
   * Parses a flyer given as text and/or an uploaded file.
   *
   * Non-blank `text` takes precedence; the file is only read when no text is
   * supplied.
   *
   * @param args.file Optional uploaded file (PDF or plain text).
   * @param args.text Optional raw flyer text.
   * @param args.retailer Retailer layout to use; defaults to `'willys'`.
   * @returns Parsed items plus any warnings collected along the way.
   * @throws BadRequestException when no text could be obtained from the input,
   *         or when the file cannot be read.
   */
  async parseFlyer(args: {
    file?: Express.Multer.File;
    text?: string;
    retailer?: 'willys';
  }): Promise<FlyerParseResponse> {
    const retailer = args.retailer ?? 'willys';
    const warnings: string[] = [];
    const text =
      args.text?.trim() || (args.file ? await this.extractTextFromFile(args.file, warnings) : '');

    if (!text) {
      throw new BadRequestException('Ingen text kunde extraheras från underlaget.');
    }

    const items = this.parseWillysText(text);
    if (items.length === 0) {
      warnings.push('Inga produkter kunde tolkas. Kontrollera PDF-kvalitet eller textformat.');
    }

    return {
      retailer,
      parserVersion: 'v1',
      items,
      warnings,
    };
  }

  /**
   * Extracts text from an uploaded file.
   *
   * A file is treated as PDF when its MIME type is `application/pdf` or
   * `application/octet-stream`, or its name ends in `.pdf`. Any other file
   * with MIME type `text/plain` is decoded as UTF-8, which matches the upload
   * filter that advertises text-file support.
   *
   * @param file Uploaded file held in memory.
   * @param warnings Collector that receives a note when a PDF has no text.
   * @returns Trimmed text; may be empty.
   * @throws BadRequestException for unsupported file types or unreadable PDFs.
   */
  private async extractTextFromFile(file: Express.Multer.File, warnings: string[]): Promise<string> {
    const isPdf =
      file.mimetype === 'application/pdf' ||
      file.mimetype === 'application/octet-stream' ||
      file.originalname?.toLowerCase().endsWith('.pdf');

    if (!isPdf) {
      if (file.mimetype === 'text/plain') {
        // trim() also removes a leading byte-order mark.
        return file.buffer.toString('utf8').trim();
      }
      throw new BadRequestException('Endast PDF eller textfil stöds i detta steg för flyer-parsning.');
    }

    try {
      const parsed = await pdfParse(file.buffer);
      const text = parsed.text?.trim() ?? '';
      if (text) return text;
      warnings.push('PDF lästes men textinnehållet var tomt.');
      return '';
    } catch (err) {
      this.logger.warn(`pdf-parse misslyckades för flyer: ${String(err)}`);
      throw new BadRequestException('Kunde inte läsa PDF-underlaget.');
    }
  }

  /**
   * Parses Willys flyer text line by line.
   *
   * Lines that start with a category word and carry no price set the current
   * category. Boilerplate lines are skipped. Everything else is handed to
   * `parseProductLine`. Duplicate items are removed at the end.
   */
  private parseWillysText(text: string): FlyerParseItem[] {
    const lines = text
      .split('\n')
      .map((line) => line.replace(/\s+/g, ' ').trim())
      .filter((line) => line.length > 1);

    const items: FlyerParseItem[] = [];
    let currentCategory: string | null = null;

    for (const line of lines) {
      const categoryMatch = line.match(CATEGORY_REGEX);
      // A line such as "Pasta Barilla 500 g 15 kr" starts with a category word
      // but is a product; only price-less lines count as category headers.
      if (categoryMatch && !FlyerParsingService.PRICE_REGEX.test(line)) {
        currentCategory = this.normalizeCategory(categoryMatch[1]);
        continue;
      }

      if (this.isIgnoredLine(line)) {
        continue;
      }

      const parsed = this.parseProductLine(line, currentCategory);
      if (parsed) {
        items.push(parsed);
      }
    }

    return this.deduplicate(items);
  }

  /**
   * Tries to interpret one line as a product.
   *
   * @param line Whitespace-normalised flyer line.
   * @param category Category in effect for this line, if any.
   * @returns The parsed item, or `null` when the line has no usable name or
   *          none of the recognised signals (price, comparison price, offer,
   *          bullet structure).
   */
  private parseProductLine(line: string, category: string | null): FlyerParseItem | null {
    const segments = line.split(/[•|]/g).map((s) => s.trim()).filter(Boolean);
    const source = segments.length > 0 ? segments.join(' ') : line;

    const compareMatch = source.match(FlyerParsingService.COMPARISON_PRICE_REGEX);
    const offerMatch = source.match(FlyerParsingService.OFFER_REGEX);

    // Blank out the comparison-price and offer fragments before looking for
    // the shelf price, so that the amount inside "jämförpris 99 kr/kg" or
    // "lägsta 30-dgrspris 45 kr" is never reported as the product price.
    const priceSource = source
      .replace(FlyerParsingService.COMPARISON_PRICE_REGEX, ' ')
      .replace(FlyerParsingService.OFFER_REGEX, ' ');
    const priceMatch = priceSource.match(FlyerParsingService.PRICE_REGEX);

    const nameCandidate = this.extractNameCandidate(segments.length > 0 ? segments[0] : line);
    if (!nameCandidate) {
      return null;
    }

    const reasonCodes: string[] = [];
    if (priceMatch) reasonCodes.push('price_found');
    if (compareMatch) reasonCodes.push('comparison_price_found');
    if (offerMatch) reasonCodes.push('offer_found');
    if (segments.length > 1) reasonCodes.push('bullet_structured_line');

    if (reasonCodes.length === 0) {
      return null;
    }

    // 0.45 base plus 0.15 per signal, capped at 0.99; rounded to two decimals
    // to keep floating-point noise (e.g. 0.8999999999999999) out of the output.
    const confidence = Math.round(Math.min(0.99, 0.45 + reasonCodes.length * 0.15) * 100) / 100;

    return {
      rawName: nameCandidate,
      normalizedName: normalizeName(nameCandidate),
      category,
      price: priceMatch ? this.parseSvNumber(priceMatch[1]) : null,
      priceUnit: priceMatch?.[2]?.toLowerCase() ?? null,
      comparisonPrice: compareMatch ? this.parseSvNumber(compareMatch[1]) : null,
      comparisonUnit: compareMatch?.[2]?.toLowerCase() ?? null,
      offerText: offerMatch?.[1] ?? null,
      confidence,
      reasonCodes,
    };
  }

  /**
   * Strips comparison-price, price and trailing offer fragments from a line
   * segment and returns what is left as the product name.
   *
   * @returns The cleaned name, or `null` when fewer than two characters or no
   *          letters remain.
   */
  private extractNameCandidate(raw: string): string | null {
    const cleaned = raw
      .replace(/j[aä]mf[oö]rpris.*$/i, ' ')
      .replace(/\b\d{1,4}(?:[\.,:]\d{1,2})?\s*kr(?:\s*\/\s*[a-zåäö]+)?\b/gi, ' ')
      .replace(/\b(max\s*\d+\s*(?:k[öo]p|f[öo]rp)\/?hush[åa]ll|l[aä]gsta\s*30-dgrspris.*)$/i, ' ')
      .replace(/\s+/g, ' ')
      .trim();

    if (cleaned.length < 2) return null;
    if (!/[a-zåäö]/i.test(cleaned)) return null;
    return cleaned;
  }

  /**
   * Lower-cases a category word, restores Swedish diacritics for the ASCII
   * spellings (kott, gronsaker, brod) and capitalises the first letter.
   */
  private normalizeCategory(value: string): string {
    const normalized = value
      .toLowerCase()
      .replace('kott', 'kött')
      .replace('gronsaker', 'grönsaker')
      .replace('brod', 'bröd');
    return normalized.charAt(0).toUpperCase() + normalized.slice(1);
  }

  /** Parses an amount that uses `,`, `:` or `.` as its decimal separator. */
  private parseSvNumber(value: string): number {
    return Number.parseFloat(value.replace(':', '.').replace(',', '.'));
  }

  /**
   * Returns `true` for lines that should not be parsed as products: lines
   * shorter than three characters, lines starting with flyer boilerplate words
   * (willys, vecka, erbjudande, gäller, reservation) and lines starting with
   * an `n/m` pattern.
   */
  private isIgnoredLine(line: string): boolean {
    const normalized = line.trim().toLowerCase();
    if (normalized.length < 3) return true;
    if (/^(willys|vecka|erbjudande|g[aä]ller|reservation)/i.test(normalized)) return true;
    if (/^\d+\s*\/\s*\d+/.test(normalized)) return true;
    return false;
  }

  /**
   * Removes repeated items, keeping the first occurrence. Two items are
   * duplicates when normalised name, price, comparison price and category all
   * match.
   */
  private deduplicate(items: FlyerParseItem[]): FlyerParseItem[] {
    const seen = new Set<string>();
    const unique: FlyerParseItem[] = [];
    for (const item of items) {
      const key = `${item.normalizedName}|${item.price ?? ''}|${item.comparisonPrice ?? ''}|${item.category ?? ''}`;
      if (seen.has(key)) continue;
      seen.add(key);
      unique.push(item);
    }
    return unique;
  }
}
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user