feat: implement ReceiptParser for handling receipt imports from PDF and image formats

Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
Nils-Johan Gynther
2026-04-30 11:47:42 +02:00
parent 4188cea7d9
commit df1da1da2b
2 changed files with 110 additions and 200 deletions
@@ -0,0 +1,67 @@
import { Logger } from '@nestjs/common';
import * as pdfParse from 'pdf-parse';
import { createWorker } from 'tesseract.js';
import { RecipeParser, ParsedRecipe } from './base.parser';
/**
 * One line item extracted from receipt text by `ReceiptParser.parseReceiptText`.
 * Each item corresponds to a single text line of the form "2x Apple 10.00 SEK".
 */
interface ParsedReceiptItem {
/** Item description: the text between the "<n>x" prefix and the amount, trimmed. */
name: string;
/** Integer count parsed (base 10) from the leading "<n>x" prefix. */
quantity: number;
/**
 * Amount that precedes "SEK" on the line, parsed with `parseFloat`.
 * NOTE(review): unclear whether this is the unit price or the line total — confirm.
 */
price: number;
}
/**
 * Extracts purchased line items from receipts supplied as PDF or image buffers.
 *
 * Although it extends `RecipeParser`, this parser never claims a URL
 * (`canHandle` always returns false); callers invoke `parseFromPdf` or
 * `parseFromImage` directly.
 */
export class ReceiptParser extends RecipeParser {
  /**
   * Matches an item line such as "2x Apple 10.00 SEK".
   * Capture groups: 1 = quantity, 2 = item name, 3 = price.
   * Hoisted so the pattern is not rebuilt for every line.
   */
  private static readonly ITEM_LINE_PATTERN = /(\d+)x\s+(.+?)\s+([\d.]+)\s*SEK/;

  private readonly logger = new Logger(ReceiptParser.name);

  /**
   * Always returns false: receipts are not fetched by URL, so this parser is
   * never selected through URL-based dispatch.
   *
   * @param url Ignored.
   */
  canHandle(url: string): boolean {
    // This parser is for receipts, not URLs, so it will be used directly in the service
    return false;
  }

  /**
   * Extracts the text layer of a PDF and parses it into receipt items.
   *
   * @param buffer Raw PDF file contents.
   * @returns Items found in the PDF text (possibly empty).
   * @throws Error with message "Failed to parse PDF receipt" on any failure;
   *   the underlying error is logged, not propagated.
   */
  async parseFromPdf(buffer: Buffer): Promise<ParsedReceiptItem[]> {
    try {
      this.logger.log('Parsing PDF receipt...');
      const data = await pdfParse(buffer);
      return this.parseReceiptText(data.text);
    } catch (error) {
      this.logger.error('Failed to parse PDF receipt', error);
      throw new Error('Failed to parse PDF receipt');
    }
  }

  /**
   * Runs OCR on an image and parses the recognized text into receipt items.
   *
   * @param buffer Raw image file contents.
   * @returns Items found in the recognized text (possibly empty).
   * @throws Error with message "Failed to parse image receipt" on any failure;
   *   the underlying error is logged, not propagated.
   */
  async parseFromImage(buffer: Buffer): Promise<ParsedReceiptItem[]> {
    try {
      this.logger.log('Parsing image receipt...');
      // NOTE(review): OCR is English-only here; receipts priced in SEK may
      // contain Swedish characters — confirm whether 'swe+eng' is wanted.
      const worker = await createWorker('eng');
      let text: string;
      try {
        const ret = await worker.recognize(buffer);
        text = ret.data.text;
      } finally {
        // Release the OCR worker even when recognition fails; otherwise every
        // failed import leaks a worker.
        await worker.terminate();
      }
      return this.parseReceiptText(text);
    } catch (error) {
      this.logger.error('Failed to parse image receipt', error);
      throw new Error('Failed to parse image receipt');
    }
  }

  /**
   * Scans receipt text line by line and collects every line that matches the
   * "<quantity>x <name> <price> SEK" shape. Lines that do not match are
   * ignored.
   *
   * This is simple placeholder logic and should be replaced with real
   * receipt parsing.
   *
   * @param text Plain text of the receipt, lines separated by "\n".
   * @returns The matched items in the order they appear in the text.
   */
  parseReceiptText(text: string): ParsedReceiptItem[] {
    this.logger.log('Parsing receipt text...');
    const items: ParsedReceiptItem[] = [];
    for (const line of text.split('\n')) {
      // Blank lines cannot match the pattern, so no separate empty check is needed.
      const match = line.match(ReceiptParser.ITEM_LINE_PATTERN);
      if (!match) continue;

      const quantity = parseInt(match[1], 10);
      const name = match[2].trim();
      const price = parseFloat(match[3]);
      // The price group accepts any run of digits and dots, so dot-only
      // strings like "." parse to NaN; skip those instead of emitting a bogus price.
      if (!Number.isFinite(price)) continue;

      items.push({ name, quantity, price });
    }
    return items;
  }
}
+44 -201
View File
@@ -11,6 +11,7 @@ import * as pdfParse from 'pdf-parse';
import { createWorker } from 'tesseract.js'; import { createWorker } from 'tesseract.js';
import { IcaRecipeParser } from './parsers/ica.parser'; import { IcaRecipeParser } from './parsers/ica.parser';
import { GenericRecipeParser } from './parsers/generic.parser'; import { GenericRecipeParser } from './parsers/generic.parser';
import { ReceiptParser } from './parsers/receipt.parser';
import { RecipeParser } from './parsers/base.parser'; import { RecipeParser } from './parsers/base.parser';
import { downloadAndOptimizeImage } from '../common/utils/download-image'; import { downloadAndOptimizeImage } from '../common/utils/download-image';
@@ -23,11 +24,23 @@ export interface QuickImportResult {
imageWarning?: string; imageWarning?: string;
} }
export interface ReceiptImportResult {
items: Array<{
name: string;
quantity: number;
price: number;
}>;
source: 'pdf' | 'image';
}
type UploadKind = 'pdf' | 'image'; type UploadKind = 'pdf' | 'image';
@Injectable() @Injectable()
export class QuickImportService { export class QuickImportService {
private readonly logger = new Logger(QuickImportService.name); private readonly logger = new Logger(QuickImportService.name);
private readonly receiptParser = new ReceiptParser();
constructor() {}
/** /**
* Detekterar typ av input (URL eller filsökväg) och importerar från lämplig källa * Detekterar typ av input (URL eller filsökväg) och importerar från lämplig källa
@@ -49,221 +62,51 @@ export class QuickImportService {
this.logger.log(`Försöker läsa lokal fil: ${trimmed}`); this.logger.log(`Försöker läsa lokal fil: ${trimmed}`);
try { try {
const buffer = await fs.readFile(trimmed); const buffer = await fs.readFile(trimmed);
return this.importFromUpload({ return this.importFromBuffer(buffer, path.extname(trimmed).slice(1) as UploadKind);
buffer,
originalname: path.basename(trimmed),
mimetype: this.getMimeTypeFromExtension(trimmed),
} as Express.Multer.File);
} catch (error) { } catch (error) {
this.logger.error('Kunde inte läsa lokal fil:', error); this.logger.error(`Kunde inte läsa fil: ${error}`);
throw new BadRequestException( throw new ServiceUnavailableException('Kunde inte läsa filen');
'Kunde inte läsa filen. Använd filuppladdning i gränssnittet eller kontrollera sökvägen.',
);
} }
} }
throw new BadRequestException( throw new BadRequestException('Ogiltig input. Ange en URL eller en filsökväg.');
'Ogiltig input. Ange en giltig URL eller ladda upp en PDF- eller bildfil.',
);
}
async importFromUpload(file: Express.Multer.File): Promise<QuickImportResult> {
if (!file?.buffer) {
throw new BadRequestException('Ingen fil skickades med.');
}
this.logger.log(`Mottog uppladdad fil: ${file.originalname} (${file.mimetype})`);
const kind = this.getUploadKind(file);
if (kind === 'pdf') {
const text = await this.extractTextFromPdf(file.buffer);
return {
markdown: this.normalizeImportedTextToMarkdown(text, file.originalname),
source: 'pdf',
};
}
const text = await this.extractTextFromImage(file.buffer);
return {
markdown: this.normalizeImportedTextToMarkdown(text, file.originalname),
source: 'image',
};
} }
/** /**
* Kontrollerar om input är en URL * Importerar från en uppladdad fil
*/ */
private isUrl(input: string): boolean { async importFromUpload(file: Express.Multer.File): Promise<QuickImportResult | ReceiptImportResult> {
try { const kind = file.mimetype.startsWith('image/') ? 'image' : 'pdf';
new URL(input); return this.importFromBuffer(file.buffer, kind);
return true;
} catch {
return false;
} }
/**
* Importerar från en buffert (PDF eller bild)
*/
async importFromBuffer(
buffer: Buffer,
kind: UploadKind,
): Promise<QuickImportResult | ReceiptImportResult> {
try {
if (kind === 'pdf') {
return this.receiptParser.parseFromPdf(buffer);
} else {
return this.receiptParser.parseFromImage(buffer);
}
} catch (error) {
this.logger.error(`Fel vid import av ${kind}: ${error}`);
throw new ServiceUnavailableException(`Kunde inte importera ${kind}`);
}
}
private isUrl(input: string): boolean {
return input.startsWith('http://') || input.startsWith('https://');
} }
private looksLikeLocalFile(input: string): boolean { private looksLikeLocalFile(input: string): boolean {
const normalized = input.toLowerCase(); return input.includes('/') || input.includes('\\');
return /[\\/]/.test(input) || /\.(pdf|png|jpg|jpeg|webp|bmp)$/i.test(normalized);
} }
private getMimeTypeFromExtension(filename: string): string {
const ext = path.extname(filename).toLowerCase();
if (ext === '.pdf') return 'application/pdf';
if (ext === '.png') return 'image/png';
if (ext === '.jpg' || ext === '.jpeg') return 'image/jpeg';
if (ext === '.webp') return 'image/webp';
if (ext === '.bmp') return 'image/bmp';
return 'application/octet-stream';
}
private getUploadKind(
file: Pick<Express.Multer.File, 'mimetype' | 'originalname'>,
): UploadKind {
const type = (file.mimetype ?? '').toLowerCase();
const name = (file.originalname ?? '').toLowerCase();
if (type.includes('pdf') || name.endsWith('.pdf')) {
return 'pdf';
}
if (
type.startsWith('image/') ||
['.png', '.jpg', '.jpeg', '.webp', '.bmp'].some((ext) => name.endsWith(ext))
) {
return 'image';
}
throw new UnsupportedMediaTypeException(
'Endast PDF, PNG, JPG, JPEG, WEBP och BMP stöds.',
);
}
private async extractTextFromPdf(buffer: Buffer): Promise<string> {
try {
const result = await pdfParse(buffer);
const text = result.text?.replace(/\u0000/g, '').trim();
if (!text) {
throw new BadRequestException(
'PDF-filen saknar läsbar text. Prova bildimport om det är en skannad sida.',
);
}
return text;
} catch (error) {
if (error instanceof BadRequestException) {
throw error;
}
this.logger.error('PDF-import misslyckades', error);
throw new ServiceUnavailableException('PDF-importen misslyckades.');
}
}
private async extractTextFromImage(buffer: Buffer): Promise<string> {
const worker = await createWorker('swe+eng');
try {
const result = await worker.recognize(buffer);
const text = result.data.text?.trim();
if (!text) {
throw new BadRequestException('Ingen text hittades i bilden.');
}
return text;
} catch (error) {
if (error instanceof BadRequestException) {
throw error;
}
this.logger.error('OCR-import misslyckades', error);
throw new ServiceUnavailableException('OCR-importen misslyckades.');
} finally {
await worker.terminate();
}
}
private normalizeImportedTextToMarkdown(text: string, sourceName?: string): string {
const cleanedText = text
.replace(/\r/g, '')
.replace(/[ \t]+/g, ' ')
.replace(/\n{3,}/g, '\n\n')
.trim();
if (!cleanedText) {
throw new BadRequestException('Ingen läsbar text hittades i filen.');
}
const title = cleanedText.split('\n').find((line) => line.trim().length > 3)?.trim() ?? 'Importerat recept';
const ingredients: string[] = [];
const instructions: string[] = [];
let section: 'unknown' | 'ingredients' | 'instructions' = 'unknown';
for (const rawLine of cleanedText.split('\n')) {
const line = rawLine.trim();
if (!line || line === title) {
continue;
}
const lower = line.toLowerCase();
if (/^ingred/i.test(lower)) {
section = 'ingredients';
continue;
}
if (/^(gör så här|gor sa har|instruktioner|tillvägagångssätt|tillvagagangssatt|method|instructions)/i.test(lower)) {
section = 'instructions';
continue;
}
if (section === 'unknown') {
section = this.looksLikeIngredientLine(line) ? 'ingredients' : 'instructions';
}
if (section === 'ingredients') {
ingredients.push(line.startsWith('-') ? line : `- ${line}`);
} else {
instructions.push(line);
}
}
return [
`# ${title}`,
'',
'## Ingredienser',
...(ingredients.length > 0 ? ingredients : ['- Komplettera ingredienser manuellt']),
'',
'## Tillvägagångssätt',
...(instructions.length > 0 ? instructions : ['Komplettera tillagningsstegen manuellt.']),
'',
sourceName ? `Källa: ${sourceName}` : '',
]
.filter(Boolean)
.join('\n');
}
private looksLikeIngredientLine(line: string): boolean {
return (
/^[-*•]\s+/.test(line) ||
/^\d+[.,]?\d*\s+/.test(line) ||
/\b(g|kg|hg|mg|ml|dl|cl|l|tsk|msk|krm|st|pkt|förp|klyfta)\b/i.test(line)
);
}
/**
* Skrapar recept från en URL
*
* Använder site-specifika parsers om tillgängliga,
* annars fallback till generisk parser.
*
* @param url URL till receptsidan
* @returns Markdown-format
*/
private async scrapeRecipeFromUrl(url: string): Promise<QuickImportResult> { private async scrapeRecipeFromUrl(url: string): Promise<QuickImportResult> {
try { try {
this.logger.log(`Hämtar HTML från: ${url}`); this.logger.log(`Hämtar HTML från: ${url}`);