refactor: remove unused parser files and update ParsedReceiptItem model with additional fields
This commit is contained in:
@@ -25,11 +25,9 @@
|
||||
"class-transformer": "^0.5.1",
|
||||
"class-validator": "^0.15.1",
|
||||
"multer": "^1.4.5-lts.2",
|
||||
"pdf-parse": "^1.1.1",
|
||||
"reflect-metadata": "^0.2.2",
|
||||
"rxjs": "^7.8.1",
|
||||
"sharp": "^0.33.5",
|
||||
"tesseract.js": "^6.0.1",
|
||||
"uuid": "^11.1.0",
|
||||
"helmet": "^8.0.0",
|
||||
"@nestjs/throttler": "^6.4.0"
|
||||
@@ -43,7 +41,6 @@
|
||||
"@types/multer": "^1.4.12",
|
||||
"@types/node": "^22.15.29",
|
||||
"@types/passport-jwt": "^4.0.1",
|
||||
"@types/pdf-parse": "^1.1.5",
|
||||
"@types/uuid": "^10.0.0",
|
||||
"prisma": "6.12.0",
|
||||
"typescript": "^5.4.5",
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
# Site-Specifika Parsers
|
||||
|
||||
Denna mapp innehåller parsers för olika receptsidor. Varje webbplats kan ha sina egna selectors och datastrukturer.
|
||||
|
||||
## Arkitektur
|
||||
|
||||
- **`base.parser.ts`** – Bas-klass som alla parsers extendar
|
||||
- **`ica.parser.ts`** – Optimerad parser för ica.se
|
||||
- **`generic.parser.ts`** – Generisk fallback-parser för okända sidor
|
||||
|
||||
## Så lägger du till en ny parser
|
||||
|
||||
1. Skapa en ny fil, t.ex. `mathem.parser.ts`:
|
||||
|
||||
```typescript
|
||||
import { RecipeParser, ParsedRecipe } from './base.parser';
|
||||
|
||||
export class MathemsRecipeParser extends RecipeParser {
|
||||
canHandle(url: string): boolean {
|
||||
return /mathem\.se/i.test(url); // Matchar bara mathem.se-URLs
|
||||
}
|
||||
|
||||
parse(html: string): ParsedRecipe {
|
||||
// Din site-specifika parsing-logik här
|
||||
// Returnera { name, ingredients, instructions }
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
2. Registrera parsern i `quick-import.service.ts`:
|
||||
|
||||
```typescript
|
||||
const parsers: RecipeParser[] = [
|
||||
new IcaRecipeParser(),
|
||||
new MathemsRecipeParser(), // Din nya parser här
|
||||
new GenericRecipeParser(), // Måste vara sist (fallback)
|
||||
];
|
||||
```
|
||||
|
||||
## Bästa praxis
|
||||
|
||||
- **I18n**: Använd svenska användarmeddelanden
|
||||
- **Säkerhet**: Sanitera HTML-output innan du använder det
|
||||
- **Robusthet**: Testa kantfall (tomma ingredienser, långa instruktioner)
|
||||
- **Prioritering**: Mer specifika parsers måste komma före generiska
|
||||
|
||||
## Tips för att debugga en ny sida
|
||||
|
||||
1. Kolla om sidan använder JSON-LD: Öppna DevTools → Sök efter `<script type="application/ld+json">`
|
||||
2. Om JSON-LD finns → kopiera strukturen och anpassa `parseIngredientLine()`
|
||||
3. Om inte → analysera HTML-strukturen och justera CSS-selectors
|
||||
|
||||
## Framtida förbättringar
|
||||
|
||||
- [ ] Stöd för Puppeteer/Playwright för JavaScript-heavy webbplatser
|
||||
- [ ] Plugin-system för community-bidrag
|
||||
- [ ] Tester per parser
|
||||
@@ -1,107 +0,0 @@
|
||||
import { RecipeParser, ParsedRecipe } from './base.parser';
|
||||
|
||||
// Konkret testklass för att komma åt protected-metoden
|
||||
/**
 * Minimal concrete subclass used only by the spec: it satisfies the abstract
 * contract with trivial implementations and re-exposes the protected
 * ingredient-line helper through a public method.
 */
class TestParser extends RecipeParser {
  canHandle(_url: string): boolean {
    return true;
  }

  parse(_html: string): ParsedRecipe {
    const emptyRecipe: ParsedRecipe = { name: '', ingredients: [] };
    return emptyRecipe;
  }

  /** Public pass-through to the protected `parseIngredientLine`. */
  public testParseIngredientLine(line: string) {
    const parsedLine = this.parseIngredientLine(line);
    return parsedLine;
  }
}
|
||||
|
||||
// Spec for RecipeParser.parseIngredientLine, driven through TestParser.
// Each case lists only the fields the test asserts on; fields left out of a
// case are not checked for that line.
describe('RecipeParser.parseIngredientLine', () => {
  const subject = new TestParser();
  const parseLine = (text: string) => subject.testParseIngredientLine(text);

  describe('enkla mängd + enhet + namn', () => {
    const exactCases = [
      { title: 'parsar "150 g lax"', line: '150 g lax', quantity: 150, unit: 'g', name: 'lax' },
      { title: 'parsar "2 dl grädde"', line: '2 dl grädde', quantity: 2, unit: 'dl', name: 'grädde' },
      { title: 'parsar "1 msk olivolja"', line: '1 msk olivolja', quantity: 1, unit: 'msk', name: 'olivolja' },
      { title: 'parsar "3 st ägg"', line: '3 st ägg', quantity: 3, unit: 'st', name: 'ägg' },
      { title: 'parsar "3 ägg" (utan enhet)', line: '3 ägg', quantity: 3, unit: '', name: 'ägg' },
    ];

    for (const testCase of exactCases) {
      it(testCase.title, () => {
        const parsed = parseLine(testCase.line);
        expect(parsed?.quantity).toBe(testCase.quantity);
        expect(parsed?.unit).toBe(testCase.unit);
        expect(parsed?.name).toBe(testCase.name);
      });
    }
  });

  describe('bråktal', () => {
    it('parsar "1/2 citron"', () => {
      const parsed = parseLine('1/2 citron');
      expect(parsed?.quantity).toBeCloseTo(0.5);
      expect(parsed?.name).toBe('citron');
    });

    it('parsar "1 1/2 dl mjölk"', () => {
      const parsed = parseLine('1 1/2 dl mjölk');
      expect(parsed?.quantity).toBeCloseTo(1.5);
      expect(parsed?.unit).toBe('dl');
    });
  });

  describe('utan mängd', () => {
    it('parsar "salt och peppar" (ingen mängd)', () => {
      const parsed = parseLine('salt och peppar');
      expect(parsed?.quantity).toBe(0);
      expect(parsed?.unit).toBe('');
      expect(parsed?.name).toBe('salt och peppar');
    });

    it('returnerar null för tom sträng', () => {
      const parsed = parseLine('');
      expect(parsed).toBeNull();
    });
  });

  describe('med parenteser', () => {
    it('parsar "1 förp handskalade räkor (à 570 g)" med note', () => {
      const parsed = parseLine('1 förp handskalade räkor (à 570 g)');
      expect(parsed?.quantity).toBe(1);
      expect(parsed?.unit).toBe('förp');
      expect(parsed?.name).toBe('handskalade räkor');
      expect(parsed?.note).toBe('à 570 g');
    });
  });

  describe('kommatalstal', () => {
    it('parsar "2,5 dl buljong"', () => {
      const parsed = parseLine('2,5 dl buljong');
      expect(parsed?.quantity).toBeCloseTo(2.5);
      expect(parsed?.unit).toBe('dl');
    });
  });

  describe('strips HTML-taggar', () => {
    it('parsar rad med HTML', () => {
      const parsed = parseLine('<b>200</b> g köttfärs');
      expect(parsed?.quantity).toBe(200);
      expect(parsed?.unit).toBe('g');
    });
  });
});
|
||||
@@ -1,159 +0,0 @@
|
||||
/**
|
||||
* Bas-parser för receptsidor
|
||||
* Alla site-specifika parsers bör extenda denna
|
||||
*/
|
||||
/**
 * Structured recipe data produced by a {@link RecipeParser}.
 */
export interface ParsedRecipe {
  name: string;
  description?: string;
  /** One entry per ingredient line; `quantity` is 0 and `unit` is '' when none was found. */
  ingredients: Array<{
    quantity: number;
    unit: string;
    name: string;
    note?: string;
  }>;
  instructions?: string;
  imageUrl?: string;
}

/**
 * Base class for recipe page parsers. Site-specific parsers extend it and
 * share the ingredient-line helper below.
 */
export abstract class RecipeParser {
  /** Returns true when this parser can handle the given URL. */
  abstract canHandle(url: string): boolean;

  /** Parses HTML and extracts recipe data. */
  abstract parse(html: string): ParsedRecipe;

  /**
   * Parses a single ingredient line into quantity, unit, name and an
   * optional note.
   *
   * Handles formats such as:
   *  - "3 ägg"
   *  - "150 g lax"
   *  - "1/2 citron", "1 1/2 dl mjölk", "1 1 / 2 dl mjölk"
   *  - "2,5 dl buljong"
   *  - "salt och peppar" (no quantity: quantity 0, unit '', whole text as name)
   *  - "1 förp handskalade räkor i lake (à 570 g)" (parenthetical kept as note)
   *
   * HTML tags are stripped first. The first parenthetical is removed from the
   * name and always returned as `note`. When the main text has no quantity but
   * the parenthetical contains "<number> <known unit>", that quantity and unit
   * are used.
   *
   * @param line Raw ingredient text, possibly containing HTML markup.
   * @returns The parsed parts, or null when the line is empty after cleaning
   *          (or is not a string at runtime).
   */
  protected parseIngredientLine(line: string): {
    quantity: number;
    unit: string;
    name: string;
    note?: string;
  } | null {
    // Callers feed values straight from JSON-LD, so guard against non-strings.
    if (typeof line !== 'string') return null;

    let cleaned = line.replace(/<[^>]+>/g, '').trim();
    if (!cleaned) return null;

    // Known units
    const knownUnits = [
      'g', 'kg', 'hg', 'mg', 'ml', 'dl', 'l', 'tl',
      'st', 'tsk', 'msk', 'krm', 'matsked', 'tesked',
      'pris', 'portion', 'port', 'burk', 'förp', 'paket', 'efter smak', 'klyfta',
    ];

    // Pull the first parenthetical out of the line.
    let parentheticalText = '';
    const parenMatch = cleaned.match(/\s*\(([^)]*)\)/);
    if (parenMatch) {
      parentheticalText = (parenMatch[1] ?? '').trim();
      cleaned = cleaned.replace(/\s*\([^)]*\)/, '').trim();
    }

    let quantity = 0;
    let remainingText = cleaned;

    // Fractions: "1/2", "1 1/2" or "1 1 / 2". The whole part must be separated
    // from the numerator by whitespace; otherwise "11/2" would be split into
    // 1 + 1/2 by regex backtracking.
    const fractionMatch = cleaned.match(/^(?:(\d+)\s+)?(\d+)\s*\/\s*([\d.]+)/);
    if (fractionMatch) {
      const whole = fractionMatch[1] ? parseFloat(fractionMatch[1]) : 0;
      const numerator = parseFloat(fractionMatch[2]);
      const denominator = parseFloat(fractionMatch[3]);
      // A zero or malformed denominator leaves the line without a quantity
      // instead of producing Infinity/NaN.
      if (Number.isFinite(denominator) && denominator !== 0) {
        quantity = whole + numerator / denominator;
        remainingText = cleaned.substring(fractionMatch[0].length).trim();
      }
    } else {
      const numberMatch = cleaned.match(/^[\d.,]+/);
      if (numberMatch) {
        const parsedNumber = parseFloat(numberMatch[0].replace(',', '.'));
        // "." or "," alone matches the pattern but is not a number.
        if (Number.isFinite(parsedNumber)) {
          quantity = parsedNumber;
          remainingText = cleaned.substring(numberMatch[0].length).trim();
        }
      }
    }

    // Candidate unit: the leading run of letters. A Unicode-aware lookahead is
    // used instead of \b, which treats å/ä/ö as non-word characters.
    let potentialUnit = '';
    let productName = remainingText;
    const unitMatch = remainingText.match(/^\p{L}+(?![\p{L}\p{N}_])/u);
    if (unitMatch) {
      const candidateUnit = unitMatch[0].toLowerCase();
      if (knownUnits.includes(candidateUnit)) {
        potentialUnit = candidateUnit;
        productName = remainingText.substring(unitMatch[0].length).trim();
      }
    }

    // The parenthetical is always preserved as a note so no text is lost.
    const note: string | undefined = parentheticalText || undefined;

    // No quantity in the main text: use the first "<number> <known unit>"
    // found anywhere in the parenthetical, e.g. "(à 570 g)".
    if (quantity === 0 && parentheticalText) {
      const quantityInParens = /(\d+(?:[.,]\d+)?)\s*(\p{L}+)/gu;
      let found: RegExpExecArray | null;
      while ((found = quantityInParens.exec(parentheticalText)) !== null) {
        const pQuantity = parseFloat(found[1].replace(',', '.'));
        const pUnit = found[2].toLowerCase();
        if (pQuantity > 0 && knownUnits.includes(pUnit)) {
          quantity = pQuantity;
          potentialUnit = pUnit;
          break;
        }
      }
    }

    // Still no quantity: return the whole cleaned text as the name.
    if (quantity === 0) {
      return {
        quantity: 0,
        unit: '',
        name: cleaned,
        note,
      };
    }

    return {
      quantity,
      unit: potentialUnit,
      name: productName,
      note,
    };
  }
}
|
||||
@@ -1,231 +0,0 @@
|
||||
import { Logger } from '@nestjs/common';
|
||||
import { RecipeParser, ParsedRecipe } from './base.parser';
|
||||
|
||||
/**
|
||||
* Generisk parser för okända receptsidor
|
||||
* Försöker JSON-LD först, sedan vanlig HTML-parsing
|
||||
* Denna är mer permissiv än site-specifika parsers
|
||||
*/
|
||||
/**
 * Generic parser for unknown recipe sites.
 * Tries JSON-LD structured data first and falls back to plain HTML scraping.
 * It is more permissive than the site-specific parsers.
 */
export class GenericRecipeParser extends RecipeParser {
  private readonly logger = new Logger(GenericRecipeParser.name);

  /** Always returns true: this parser accepts every URL (fallback). */
  canHandle(url: string): boolean {
    return true;
  }

  /**
   * Extracts a recipe from a page.
   *
   * Every `application/ld+json` script is tried in document order; the first
   * one that contains a Recipe node wins. If none does, the HTML heuristics
   * in `parseFromHtml` are used.
   *
   * @param html Full page markup.
   * @returns The extracted recipe; fields that could not be found are empty.
   */
  parse(html: string): ParsedRecipe {
    this.logger.log('Parsing recipe from unknown site...');

    // og:image is the fallback when the structured data carries no image.
    const ogImage = this.extractOgImage(html);

    // Pages commonly carry several JSON-LD script tags.
    const jsonLdRegex =
      /<script[^>]*type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/gi;
    let jsonLdMatch: RegExpExecArray | null;
    while ((jsonLdMatch = jsonLdRegex.exec(html)) !== null) {
      const rawJson = jsonLdMatch[1]?.trim();
      if (!rawJson) continue;

      try {
        const recipe = this.findRecipeInJsonLd(JSON.parse(rawJson));
        if (recipe) {
          this.logger.log('JSON-LD data found');
          return this.extractFromJsonLd(recipe, ogImage);
        }
      } catch (err) {
        // A broken script tag must not abort the import; log why and move on.
        this.logger.warn(`JSON-LD parsing failed: ${err}`);
      }
    }

    this.logger.log('No JSON-LD found, using HTML parsing');
    return this.parseFromHtml(html, ogImage);
  }

  /**
   * Finds the first node whose `@type` is (or includes) "Recipe".
   * Searches arrays and `@graph` containers recursively.
   *
   * @returns The recipe node, or null when none is present.
   */
  private findRecipeInJsonLd(jsonData: any): any {
    if (!jsonData || typeof jsonData !== 'object') return null;

    if (Array.isArray(jsonData)) {
      for (const item of jsonData) {
        const recipe = this.findRecipeInJsonLd(item);
        if (recipe) return recipe;
      }
      return null;
    }

    const type = jsonData['@type'];
    if (type === 'Recipe' || (Array.isArray(type) && type.includes('Recipe'))) {
      return jsonData;
    }

    const graph = jsonData['@graph'];
    if (Array.isArray(graph)) {
      return this.findRecipeInJsonLd(graph);
    }

    return null;
  }

  /** Reads the `og:image` meta tag, accepting either attribute order. */
  private extractOgImage(html: string): string | undefined {
    const match = html.match(/<meta[^>]+property="og:image"[^>]+content="([^"]+)"/i)
      || html.match(/<meta[^>]+content="([^"]+)"[^>]+property="og:image"/i);
    return match ? this.decodeHtmlEntities(match[1].trim()) : undefined;
  }

  /**
   * Decodes the five basic HTML entities.
   * `&amp;` is decoded last so that "&amp;lt;" becomes "&lt;" rather than "<".
   */
  private decodeHtmlEntities(value: string): string {
    return value
      .replace(/&quot;/g, '"')
      .replace(/&#39;|&#x27;|&apos;/gi, "'")
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&amp;/g, '&');
  }

  /**
   * Resolves a JSON-LD `image` value (string, array or ImageObject-like
   * object) to the first URL string found.
   */
  private extractImageValue(image: any): string | undefined {
    if (!image) return undefined;
    if (typeof image === 'string') return image;

    if (Array.isArray(image)) {
      for (const item of image) {
        const extracted = this.extractImageValue(item);
        if (extracted) return extracted;
      }
      return undefined;
    }

    if (typeof image === 'object') {
      // Same precedence as before, but each candidate is resolved recursively
      // so that a non-string value can never be returned.
      const candidates = [
        image.url,
        image['@id'],
        image.contentUrl,
        image.thumbnailUrl,
        image.image,
      ];
      for (const candidate of candidates) {
        const extracted = this.extractImageValue(candidate);
        if (extracted) return extracted;
      }
    }

    return undefined;
  }

  /**
   * Flattens `recipeInstructions` into a list of step texts. Accepts plain
   * strings, objects with a `text` field, objects with `itemListElement`
   * (sections), and arrays of any of these.
   */
  private collectInstructionTexts(node: any): string[] {
    if (!node) return [];
    if (typeof node === 'string') return [node];
    if (Array.isArray(node)) {
      return node.flatMap((child: any) => this.collectInstructionTexts(child));
    }
    if (typeof node === 'object') {
      if (typeof node.text === 'string' && node.text) return [node.text];
      if (node.itemListElement) return this.collectInstructionTexts(node.itemListElement);
    }
    return [];
  }

  /** Converts an HTML fragment to text, one line per list item/paragraph. */
  private htmlToText(fragment: string): string {
    return fragment
      .replace(/<\/(?:li|p|div)>|<br\s*\/?>/gi, '\n')
      .replace(/<[^>]+>/g, '')
      .split('\n')
      .map((textLine) => textLine.trim())
      .filter((textLine) => textLine)
      .join('\n');
  }

  /**
   * Maps a JSON-LD Recipe node to `ParsedRecipe`.
   *
   * @param recipe  Node returned by `findRecipeInJsonLd`.
   * @param ogImage Image URL used when the node has no usable `image`.
   */
  private extractFromJsonLd(recipe: any, ogImage?: string): ParsedRecipe {
    const name = typeof recipe.name === 'string' ? recipe.name : '';
    const description = typeof recipe.description === 'string' ? recipe.description : '';

    // Image from the JSON-LD node, falling back to og:image.
    let imageUrl: string | undefined = ogImage;
    const extractedImage = this.extractImageValue(recipe.image);
    if (extractedImage) {
      imageUrl = this.decodeHtmlEntities(extractedImage);
    }

    const ingredients: Array<{ quantity: number; unit: string; name: string; note?: string }> = [];
    if (Array.isArray(recipe.recipeIngredient)) {
      for (const ing of recipe.recipeIngredient) {
        // Skip non-string entries instead of throwing inside the line parser.
        if (typeof ing !== 'string') continue;
        const parsed = this.parseIngredientLine(ing);
        if (parsed) {
          ingredients.push(parsed);
        }
      }
    }

    const instructions = typeof recipe.recipeInstructions === 'string'
      ? recipe.recipeInstructions
      : this.collectInstructionTexts(recipe.recipeInstructions).join('\n\n');

    return {
      name,
      description,
      ingredients,
      instructions,
      imageUrl,
    };
  }

  /**
   * Heuristic HTML fallback used when no JSON-LD recipe exists.
   *
   * @param html    Full page markup.
   * @param ogImage Image URL to report, if any.
   */
  private parseFromHtml(html: string, ogImage?: string): ParsedRecipe {
    // Title: first <h1>, then og:title, then <title>.
    let name = '';
    const titleMatch =
      html.match(/<h1\b[^>]*>([^<]+)<\/h1>/i) ||
      html.match(/<meta\s+property="og:title"\s+content="([^"]+)"/i) ||
      html.match(/<title>([^<]+)<\/title>/i);
    if (titleMatch) {
      name = titleMatch[1].trim();
    }

    // Description from the meta tag.
    let description = '';
    const descMatch = html.match(
      /<meta\s+name="description"\s+content="([^"]+)"/i
    );
    if (descMatch) {
      description = descMatch[1].trim();
    }

    const ingredients: Array<{ quantity: number; unit: string; name: string; note?: string }> = [];

    // Candidate ingredient containers, tried in order. [\s\S] lets an element
    // span several lines; \b keeps "<li" from matching "<link" and "<p" from
    // matching "<pre".
    const ingredientPatterns = [
      /<li\b[^>]*>([\s\S]*?)<\/li>/gi,
      /<div\b[^>]*class="ingredient"[^>]*>([\s\S]*?)<\/div>/gi,
      /<p\b[^>]*class="ingredient"[^>]*>([\s\S]*?)<\/p>/gi,
    ];

    for (const pattern of ingredientPatterns) {
      let match: RegExpExecArray | null;
      while ((match = pattern.exec(html)) !== null) {
        const parsed = this.parseIngredientLine(match[1]);
        // Very short names are most likely noise (menu items, icons).
        if (parsed && parsed.name.length > 2) {
          ingredients.push(parsed);
        }
      }
      if (ingredients.length > 0) break; // first pattern that yields anything wins
    }

    // Instructions: a container whose class mentions instruction/method/step,
    // otherwise the first ordered list.
    let instructions = '';
    const instructionsPatterns = [
      /<(?:div|section)[^>]*class="[^"]*(?:instruction|method|step)[^"]*"[^>]*>(.*?)<\/(?:div|section)>/is,
      /<ol\b[^>]*>(.*?)<\/ol>/is,
    ];

    for (const pattern of instructionsPatterns) {
      const match = html.match(pattern);
      if (match) {
        instructions = this.htmlToText(match[1]);
        if (instructions.length > 10) break;
      }
    }

    return {
      name,
      description,
      ingredients,
      instructions,
      imageUrl: ogImage,
    };
  }
}
|
||||
@@ -1,219 +0,0 @@
|
||||
import { Logger } from '@nestjs/common';
|
||||
import { RecipeParser, ParsedRecipe } from './base.parser';
|
||||
|
||||
/**
|
||||
* Parser för ica.se receptsidor
|
||||
* Använder JSON-LD structured data som primär källa
|
||||
*/
|
||||
/**
 * Parser for ica.se recipe pages.
 * Uses JSON-LD structured data as the primary source and falls back to
 * HTML scraping.
 */
export class IcaRecipeParser extends RecipeParser {
  private readonly logger = new Logger(IcaRecipeParser.name);

  /**
   * Returns true for URLs whose host is ica.se (or a subdomain) and whose
   * path starts with /recept. The pattern is anchored so that hosts such as
   * "notica.se" or URLs that merely mention "ica.se/recept" in a query string
   * are not claimed by this parser.
   */
  canHandle(url: string): boolean {
    return /^(?:https?:\/\/)?(?:[a-z0-9-]+\.)*ica\.se(?::\d+)?\/recept/i.test(url.trim());
  }

  /**
   * Extracts a recipe from an ICA page.
   *
   * Every `application/ld+json` script is tried in document order; the first
   * one that contains a Recipe node wins. If none does, `parseFromHtml` is
   * used.
   *
   * @param html Full page markup.
   * @returns The extracted recipe; fields that could not be found are empty.
   */
  parse(html: string): ParsedRecipe {
    this.logger.log('Parsing ICA recipe...');

    // og:image is the fallback when the structured data carries no image.
    const ogImage = this.extractOgImage(html);

    // Iterate over all JSON-LD script tags on the page.
    const jsonLdRegex =
      /<script[^>]*type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/gi;
    let jsonLdMatch: RegExpExecArray | null;
    while ((jsonLdMatch = jsonLdRegex.exec(html)) !== null) {
      const rawJson = jsonLdMatch[1]?.trim();
      if (!rawJson) continue;

      try {
        const recipe = this.findRecipeInJsonLd(JSON.parse(rawJson));
        if (recipe) {
          this.logger.log('JSON-LD recipe found');
          return this.extractFromJsonLd(recipe, ogImage);
        }
      } catch (err) {
        // A broken script tag must not abort the import; log why and move on.
        this.logger.warn(`JSON-LD parsing failed: ${err}`);
      }
    }

    // Fallback: HTML scraping.
    this.logger.log('Falling back to HTML parsing');
    return this.parseFromHtml(html, ogImage);
  }

  /**
   * Finds the first node whose `@type` is (or includes) "Recipe".
   * Searches arrays and `@graph` containers recursively.
   *
   * @returns The recipe node, or null when none is present.
   */
  private findRecipeInJsonLd(jsonData: any): any {
    if (!jsonData || typeof jsonData !== 'object') return null;

    if (Array.isArray(jsonData)) {
      for (const item of jsonData) {
        const recipe = this.findRecipeInJsonLd(item);
        if (recipe) return recipe;
      }
      return null;
    }

    const type = jsonData['@type'];
    if (type === 'Recipe' || (Array.isArray(type) && type.includes('Recipe'))) {
      return jsonData;
    }

    const graph = jsonData['@graph'];
    if (Array.isArray(graph)) {
      return this.findRecipeInJsonLd(graph);
    }

    return null;
  }

  /** Reads the `og:image` meta tag, accepting either attribute order. */
  private extractOgImage(html: string): string | undefined {
    const match = html.match(/<meta[^>]+property="og:image"[^>]+content="([^"]+)"/i)
      || html.match(/<meta[^>]+content="([^"]+)"[^>]+property="og:image"/i);
    return match ? this.decodeHtmlEntities(match[1].trim()) : undefined;
  }

  /**
   * Decodes the five basic HTML entities.
   * `&amp;` is decoded last so that "&amp;lt;" becomes "&lt;" rather than "<".
   */
  private decodeHtmlEntities(value: string): string {
    return value
      .replace(/&quot;/g, '"')
      .replace(/&#39;|&#x27;|&apos;/gi, "'")
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&amp;/g, '&');
  }

  /**
   * Resolves a JSON-LD `image` value (string, array or ImageObject-like
   * object) to the first URL string found.
   */
  private extractImageValue(image: any): string | undefined {
    if (!image) return undefined;
    if (typeof image === 'string') return image;

    if (Array.isArray(image)) {
      for (const item of image) {
        const extracted = this.extractImageValue(item);
        if (extracted) return extracted;
      }
      return undefined;
    }

    if (typeof image === 'object') {
      // Same precedence as before, but each candidate is resolved recursively
      // so that a non-string value can never be returned.
      const candidates = [
        image.url,
        image['@id'],
        image.contentUrl,
        image.thumbnailUrl,
        image.image,
      ];
      for (const candidate of candidates) {
        const extracted = this.extractImageValue(candidate);
        if (extracted) return extracted;
      }
    }

    return undefined;
  }

  /**
   * Flattens `recipeInstructions` into a list of step texts. Accepts plain
   * strings, objects with a `text` field, objects with `itemListElement`
   * (sections), and arrays of any of these.
   */
  private collectInstructionTexts(node: any): string[] {
    if (!node) return [];
    if (typeof node === 'string') return [node];
    if (Array.isArray(node)) {
      return node.flatMap((child: any) => this.collectInstructionTexts(child));
    }
    if (typeof node === 'object') {
      if (typeof node.text === 'string' && node.text) return [node.text];
      if (node.itemListElement) return this.collectInstructionTexts(node.itemListElement);
    }
    return [];
  }

  /** Converts an HTML fragment to text, one line per list item/paragraph. */
  private htmlToText(fragment: string): string {
    return fragment
      .replace(/<\/(?:li|p|div)>|<br\s*\/?>/gi, '\n')
      .replace(/<[^>]+>/g, '')
      .split('\n')
      .map((textLine) => textLine.trim())
      .filter((textLine) => textLine)
      .join('\n');
  }

  /**
   * Maps a JSON-LD Recipe node to `ParsedRecipe`.
   *
   * @param recipe  Node returned by `findRecipeInJsonLd`.
   * @param ogImage Image URL used when the node has no usable `image`.
   */
  private extractFromJsonLd(recipe: any, ogImage?: string): ParsedRecipe {
    // Title and description
    const name = typeof recipe.name === 'string' ? recipe.name : '';
    const description = typeof recipe.description === 'string' ? recipe.description : '';

    // Image from JSON-LD (string, array or object), falling back to og:image.
    let imageUrl: string | undefined = ogImage;
    const extractedImage = this.extractImageValue(recipe.image);
    if (extractedImage) {
      imageUrl = this.decodeHtmlEntities(extractedImage);
    }

    // Ingredients
    const ingredients: Array<{ quantity: number; unit: string; name: string; note?: string }> = [];
    if (Array.isArray(recipe.recipeIngredient)) {
      for (const ing of recipe.recipeIngredient) {
        // Skip non-string entries instead of throwing inside the line parser.
        if (typeof ing !== 'string') continue;
        const parsed = this.parseIngredientLine(ing);
        if (parsed) {
          ingredients.push(parsed);
        }
      }
    }

    // Instructions
    const instructions = typeof recipe.recipeInstructions === 'string'
      ? recipe.recipeInstructions
      : this.collectInstructionTexts(recipe.recipeInstructions).join('\n\n');

    return {
      name,
      description,
      ingredients,
      instructions,
      imageUrl,
    };
  }

  /**
   * HTML fallback used when the page has no JSON-LD recipe.
   *
   * @param html    Full page markup.
   * @param ogImage Image URL to report, if any.
   */
  private parseFromHtml(html: string, ogImage?: string): ParsedRecipe {
    // Title: first <h1>, otherwise og:title.
    let name = '';
    const titleMatch = html.match(/<h1\b[^>]*>([^<]+)<\/h1>/i);
    if (titleMatch) {
      name = titleMatch[1].trim();
    }

    if (!name) {
      const ogTitleMatch = html.match(
        /<meta\s+property="og:title"\s+content="([^"]+)"/i
      );
      if (ogTitleMatch) {
        name = ogTitleMatch[1].trim();
      }
    }

    // Description from the meta tag.
    let description = '';
    const descMatch = html.match(
      /<meta\s+name="description"\s+content="([^"]+)"/i
    );
    if (descMatch) {
      description = descMatch[1].trim();
    }

    // List items whose class mentions "ingredient". Inner markup is allowed;
    // parseIngredientLine strips tags itself.
    const ingredients: Array<{ quantity: number; unit: string; name: string; note?: string }> = [];
    const ingredientRegex =
      /<li\b[^>]*class="[^"]*ingredient[^"]*"[^>]*>([\s\S]*?)<\/li>/gi;
    let match: RegExpExecArray | null;
    while ((match = ingredientRegex.exec(html)) !== null) {
      const parsed = this.parseIngredientLine(match[1]);
      if (parsed) {
        ingredients.push(parsed);
      }
    }

    // First container whose class mentions instruction/howto. The capture is
    // lazy, so with nested containers it ends at the first closing tag.
    let instructions = '';
    const instructionsMatch = html.match(
      /<(?:div|section)[^>]*class="[^"]*(?:instruction|howto)[^"]*"[^>]*>([\s\S]*?)<\/(?:div|section)>/i
    );
    if (instructionsMatch) {
      instructions = this.htmlToText(instructionsMatch[1]);
    }

    return {
      name,
      description,
      ingredients,
      instructions,
      imageUrl: ogImage,
    };
  }
}
|
||||
@@ -1,71 +0,0 @@
|
||||
import { Logger } from '@nestjs/common';
|
||||
import * as pdfParse from 'pdf-parse';
|
||||
import { createWorker } from 'tesseract.js';
|
||||
import { RecipeParser, ParsedRecipe } from './base.parser';
|
||||
|
||||
/**
 * A single line item extracted from receipt text.
 */
interface ParsedReceiptItem {
  /** Item description as captured from the receipt line. */
  name: string;

  /** Count captured from the "<n>x" prefix of the line. */
  quantity: number;

  /** Amount captured before the currency marker on the line. */
  price: number;
}
|
||||
|
||||
/**
 * Extracts line items from receipts supplied as PDF or image buffers.
 *
 * It extends RecipeParser but is not URL-driven: `canHandle` always returns
 * false and `parse` always throws. Use `parseFromPdf`, `parseFromImage` or
 * `parseReceiptText` directly.
 */
export class ReceiptParser extends RecipeParser {
  private readonly logger = new Logger(ReceiptParser.name);

  /** Always false: receipts are not selected by URL. */
  canHandle(url: string): boolean {
    // This parser is for receipts, not URLs, so it will be used directly in the service
    return false;
  }

  /**
   * Not supported for receipts.
   * @throws Error always.
   */
  parse(_html: string): ParsedRecipe {
    throw new Error('ReceiptParser does not support HTML parsing');
  }

  /**
   * Extracts the text layer of a PDF and parses it as receipt text.
   *
   * @param buffer Raw PDF bytes.
   * @returns Items found in the text; empty when no line matches.
   * @throws Error('Failed to parse PDF receipt') when extraction fails.
   */
  async parseFromPdf(buffer: Buffer): Promise<ParsedReceiptItem[]> {
    try {
      this.logger.log('Parsing PDF receipt...');
      const data = await pdfParse(buffer);
      return this.parseReceiptText(data.text);
    } catch (error) {
      this.logger.error('Failed to parse PDF receipt', error);
      throw new Error('Failed to parse PDF receipt');
    }
  }

  /**
   * Runs OCR on an image and parses the recognised text as receipt text.
   *
   * @param buffer Raw image bytes.
   * @returns Items found in the text; empty when no line matches.
   * @throws Error('Failed to parse image receipt') when OCR fails.
   */
  async parseFromImage(buffer: Buffer): Promise<ParsedReceiptItem[]> {
    try {
      this.logger.log('Parsing image receipt...');
      // NOTE(review): OCR runs with the 'eng' language; confirm this is
      // intended for receipts containing Swedish text.
      const worker = await createWorker('eng');
      try {
        const ret = await worker.recognize(buffer);
        return this.parseReceiptText(ret.data.text);
      } finally {
        // Always release the worker, including when recognition throws.
        await worker.terminate();
      }
    } catch (error) {
      this.logger.error('Failed to parse image receipt', error);
      throw new Error('Failed to parse image receipt');
    }
  }

  /**
   * Parses receipt text line by line.
   *
   * Only lines of the form "<count>x <name> <price> SEK" are recognised,
   * e.g. "2x Apple 10.00 SEK" or "2x Apple 10,00 SEK"; everything else is
   * skipped. This is placeholder logic (see the note in the body).
   *
   * @param text Plain receipt text with one item per line.
   * @returns Items in the order they appear in the text.
   */
  parseReceiptText(text: string): ParsedReceiptItem[] {
    this.logger.log('Parsing receipt text...');
    // Simple parsing logic to extract items from receipt text
    // This is a placeholder and should be replaced with actual parsing logic
    const items: ParsedReceiptItem[] = [];

    // Accept both LF and CRLF line endings.
    for (const line of text.split(/\r?\n/)) {
      if (line.trim() === '') continue;

      // The price may use either '.' or ',' as decimal separator.
      const match = line.match(/(\d+)x\s+(.+?)\s+([\d.,]+)\s*SEK/);
      if (!match) continue;

      const quantity = parseInt(match[1], 10);
      const name = match[2].trim();
      const price = parseFloat(match[3].replace(',', '.'));
      // A stray "." or "," matches the pattern but is not a number.
      if (!Number.isFinite(price)) continue;

      items.push({ name, quantity, price });
    }

    return items;
  }
}
|
||||
@@ -1,4 +1,4 @@
|
||||
import { Body, Controller, Post, UploadedFile, UseInterceptors } from '@nestjs/common';
|
||||
import { Body, Controller, HttpCode, Post, UploadedFile, UseInterceptors } from '@nestjs/common';
|
||||
import { Throttle } from '@nestjs/throttler';
|
||||
import { FileInterceptor } from '@nestjs/platform-express';
|
||||
import { memoryStorage } from 'multer';
|
||||
@@ -10,6 +10,7 @@ export class QuickImportController {
|
||||
constructor(private readonly quickImportService: QuickImportService) {}
|
||||
|
||||
@Post()
|
||||
@HttpCode(200)
|
||||
@Throttle({ default: { ttl: 60_000, limit: 20 } })
|
||||
@UseInterceptors(
|
||||
FileInterceptor('file', {
|
||||
|
||||
@@ -3,25 +3,43 @@ class ParsedReceiptItem {
|
||||
final String rawName;
|
||||
final double? quantity;
|
||||
final String? unit;
|
||||
final String? suggestedProductId;
|
||||
final double? price;
|
||||
final String? brand;
|
||||
final String? origin;
|
||||
// alias-match (säker, ingen bekräftelse behövs)
|
||||
final int? matchedProductId;
|
||||
final String? matchedProductName;
|
||||
// ordbaserad match (kräver bekräftelse)
|
||||
final int? suggestedProductId;
|
||||
final String? suggestedProductName;
|
||||
final String? categorySuggestion;
|
||||
// AI-kategorisuggestion (premium)
|
||||
final String? categorySuggestionName;
|
||||
|
||||
ParsedReceiptItem({
|
||||
required this.rawName,
|
||||
this.quantity,
|
||||
this.unit,
|
||||
this.price,
|
||||
this.brand,
|
||||
this.origin,
|
||||
this.matchedProductId,
|
||||
this.matchedProductName,
|
||||
this.suggestedProductId,
|
||||
this.suggestedProductName,
|
||||
this.categorySuggestion,
|
||||
this.categorySuggestionName,
|
||||
});
|
||||
|
||||
factory ParsedReceiptItem.fromJson(Map<String, dynamic> json) => ParsedReceiptItem(
|
||||
rawName: json['rawName'] as String,
|
||||
rawName: json['rawName'] as String? ?? '',
|
||||
quantity: (json['quantity'] as num?)?.toDouble(),
|
||||
unit: json['unit'] as String?,
|
||||
suggestedProductId: json['suggestedProductId'] as String?,
|
||||
price: (json['price'] as num?)?.toDouble(),
|
||||
brand: json['brand'] as String?,
|
||||
origin: json['origin'] as String?,
|
||||
matchedProductId: (json['matchedProductId'] as num?)?.toInt(),
|
||||
matchedProductName: json['matchedProductName'] as String?,
|
||||
suggestedProductId: (json['suggestedProductId'] as num?)?.toInt(),
|
||||
suggestedProductName: json['suggestedProductName'] as String?,
|
||||
categorySuggestion: json['categorySuggestion'] as String?,
|
||||
categorySuggestionName: (json['categorySuggestion'] as Map<String, dynamic>?)?['categoryName'] as String?,
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user