feat: Implement site-specific recipe parsers for ICA and generic fallback

2026-04-12 09:39:32 +02:00
parent 2c92e07d39
commit 4e2616fe2e
5 changed files with 414 additions and 86 deletions
@@ -0,0 +1,57 @@
+# Site-Specifika Parsers
+
+Denna mapp innehåller parsers för olika receptsidor. Varje webbplats kan ha sina egna selectors och datastrukturer.
+
+## Arkitektur
+
+- **`base.parser.ts`** – Basklass som alla parsers ärver från (`extends`)
+- **`ica.parser.ts`** – Optimerad parser för ica.se
+- **`generic.parser.ts`** – Generisk fallback-parser för okända sidor
+
+## Så lägger du till en ny parser
+
+1. Skapa en ny fil, t.ex. `mathem.parser.ts`:
+
+```typescript
+import { RecipeParser, ParsedRecipe } from './base.parser';
+
+export class MathemsRecipeParser extends RecipeParser {
+  canHandle(url: string): boolean {
+    return /mathem\.se/i.test(url); // Matchar alla URL:er som innehåller "mathem.se"
+  }
+
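+  parse(html: string): ParsedRecipe {
+    // Din site-specifika parsing-logik här.
+    // Metoden måste returnera ett objekt { name, ingredients, instructions },
+    // annars kompilerar inte exemplet:
+    return { name: '', ingredients: [], instructions: '' };
+  }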
+}
+```
+
+2. Registrera parsern i `quick-import.service.ts`:
+
+```typescript
+const parsers: RecipeParser[] = [
+  new IcaRecipeParser(),
+  new MathemsRecipeParser(), // Din nya parser här
+  new GenericRecipeParser(), // Måste vara sist (fallback)
+];
+```
+
+## Bästa praxis
+
+- **I18n**: Använd svenska användarmeddelanden
+- **Säkerhet**: Sanera HTML-output innan du använder den
+- **Robusthet**: Testa edge-cases (tomma ingredienser, långa instruktioner)
+- **Prioritering**: Mer specifika parsers måste komma före generiska
+
+## Tips för att debugga en ny sida
+
+1. Kolla om sidan använder JSON-LD: Öppna DevTools → Sök efter `<script type="application/ld+json">`
+2. Om JSON-LD finns → kopiera strukturen och anpassa `parseIngredientLine()`
+3. Om inte → analysera HTML-strukturen och justera CSS-selectors
+
+## Framtida förbättringar
+
+- [ ] Stöd för Puppeteer/Playwright för JavaScript-heavy webbplatser
+- [ ] Plugin-system för community-bidrag
+- [ ] Tester per parser
@@ -0,0 +1,53 @@
+/**
+ * Bas-parser för receptsidor
+ * Alla site-specifika parsers bör extenda denna
+ */
+/**
+ * Normalized recipe data returned by a parser's `parse` method.
+ */
+export interface ParsedRecipe {
+  /** Recipe title. */
+  name: string;
+  /** Optional free-text description. */
+  description?: string;
+  /** Ingredient rows, each split into amount, unit and ingredient name. */
+  ingredients: { quantity: number; unit: string; name: string }[];
+  /** Optional preparation text. */
+  instructions?: string;
+}
+
+/**
+ * Common base for recipe parsers. Subclasses decide which URLs they accept
+ * and how a page's HTML is turned into a `ParsedRecipe`; this class supplies
+ * the shared ingredient-line splitter.
+ */
+export abstract class RecipeParser {
+  /**
+   * Words accepted as a unit directly after a quantity. Any other word is
+   * treated as the beginning of the ingredient name, so "2 ägg" is not read
+   * as two of the unit "ägg".
+   */
+  private static readonly KNOWN_UNITS: ReadonlySet<string> = new Set([
+    'st', 'mg', 'g', 'hg', 'kg', 'ml', 'cl', 'dl', 'l', 'cm',
+    'krm', 'tsk', 'msk', 'nypa', 'nypor', 'port', 'förp', 'pkt', 'paket',
+    'burk', 'burkar', 'klyfta', 'klyftor', 'skiva', 'skivor', 'kruka', 'knippe',
+  ]);
+
+  /**
+   * Reports whether this parser is able to handle the given URL.
+   */
+  abstract canHandle(url: string): boolean;
+
+  /**
+   * Parses a page's HTML and extracts the recipe data.
+   */
+  abstract parse(html: string): ParsedRecipe;
+
+  /**
+   * Splits one ingredient line into quantity, unit and name.
+   *
+   * HTML tags are removed and whitespace is collapsed first. A leading
+   * decimal ("1,5"), fraction ("1/2") or mixed number ("2 1/2") becomes the
+   * quantity. The following word becomes the unit only when it is a known
+   * measurement word and more text follows it; otherwise the unit defaults
+   * to 'st'. Lines without a leading number get quantity 0.
+   *
+   * @param line Raw ingredient text, possibly containing markup.
+   * @returns The split ingredient, or null when the line has no text.
+   */
+  protected parseIngredientLine(line: string): {
+    quantity: number;
+    unit: string;
+    name: string;
+  } | null {
+    // Callers feed values taken from untyped JSON, so guard at runtime too.
+    if (typeof line !== 'string') return null;
+
+    const cleaned = line.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim();
+    if (!cleaned) return null;
+
+    let quantity = 0;
+    let rest = cleaned;
+    let hasQuantity = false;
+
+    const fraction = cleaned.match(/^(?:(\d+)\s+)?(\d+)\s*\/\s*(\d+)\s*(.*)$/);
+    const decimal = cleaned.match(/^(\d+(?:[.,]\d+)?|[.,]\d+)\s*(.*)$/);
+    if (fraction && Number(fraction[3]) !== 0) {
+      quantity = Number(fraction[1] ?? 0) + Number(fraction[2]) / Number(fraction[3]);
+      rest = fraction[4];
+      hasQuantity = true;
+    } else if (decimal) {
+      quantity = parseFloat(decimal[1].replace(',', '.'));
+      rest = decimal[2];
+      hasQuantity = true;
+    }
+
+    // A line that is only a number has no name left; keep the text as the name.
+    if (!rest) {
+      return { quantity: 0, unit: 'st', name: cleaned };
+    }
+
+    if (hasQuantity) {
+      const unitMatch = rest.match(/^([a-zåäö]+)\.?\s+(.+)$/i);
+      if (unitMatch && RecipeParser.KNOWN_UNITS.has(unitMatch[1].toLowerCase())) {
+        return {
+          quantity,
+          unit: unitMatch[1].toLowerCase(),
+          name: unitMatch[2].trim(),
+        };
+      }
+    }
+
+    return { quantity, unit: 'st', name: rest };
+  }
+}
@@ -0,0 +1,136 @@
+import { RecipeParser, ParsedRecipe } from './base.parser';
+
+/**
+ * Generic parser for unknown recipe sites.
+ * Tries schema.org JSON-LD first and then falls back to regex-based HTML
+ * scraping. It is more permissive than the site-specific parsers.
+ */
+export class GenericRecipeParser extends RecipeParser {
+  /**
+   * Accepts every URL because this parser is the fallback.
+   */
+  canHandle(url: string): boolean {
+    return true;
+  }
+
+  /**
+   * Extracts a recipe from the page, preferring JSON-LD over HTML scraping.
+   *
+   * @param html Full HTML of the recipe page.
+   * @returns The extracted recipe; fields stay empty when nothing is found.
+   */
+  parse(html: string): ParsedRecipe {
+    console.log('[GenericParser] Parsing recipe from unknown site...');
+
+    try {
+      const recipe = this.findJsonLdRecipe(html);
+      if (recipe) {
+        console.log('[GenericParser] ✓ JSON-LD data found');
+        return this.extractFromJsonLd(recipe);
+      }
+    } catch (err) {
+      // Unexpected JSON-LD shapes must not abort the import; use HTML instead.
+      console.log('[GenericParser] JSON-LD parsing failed', err);
+    }
+
+    console.log('[GenericParser] No JSON-LD found, using HTML parsing');
+    return this.parseFromHtml(html);
+  }
+
+  /**
+   * Scans every JSON-LD script block for a Recipe node. Pages often place
+   * other blocks (breadcrumbs, organization data) before the recipe, so
+   * looking at the first block only is not enough.
+   */
+  private findJsonLdRecipe(html: string): Record<string, unknown> | null {
+    const scriptPattern =
+      /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
+    let script: RegExpExecArray | null;
+    while ((script = scriptPattern.exec(html)) !== null) {
+      let data: unknown;
+      try {
+        data = JSON.parse(script[1]);
+      } catch (err) {
+        // One malformed block should not hide a valid one further down.
+        console.log('[GenericParser] JSON-LD parsing failed', err);
+        continue;
+      }
+      const recipe = this.findRecipeNode(data);
+      if (recipe) return recipe;
+    }
+    return null;
+  }
+
+  /**
+   * Finds the first node whose `@type` is (or includes) 'Recipe', descending
+   * into top-level arrays and `@graph` containers.
+   */
+  private findRecipeNode(node: unknown): Record<string, unknown> | null {
+    if (Array.isArray(node)) {
+      for (const item of node) {
+        const found = this.findRecipeNode(item);
+        if (found) return found;
+      }
+      return null;
+    }
+    if (typeof node !== 'object' || node === null) return null;
+
+    const record = node as Record<string, unknown>;
+    const type = record['@type'];
+    const types: unknown[] = Array.isArray(type) ? type : [type];
+    if (types.includes('Recipe')) return record;
+
+    return this.findRecipeNode(record['@graph']);
+  }
+
+  /**
+   * Maps a JSON-LD Recipe node to a ParsedRecipe.
+   */
+  private extractFromJsonLd(recipe: Record<string, unknown>): ParsedRecipe {
+    const rawName = recipe['name'];
+    const name = typeof rawName === 'string' ? rawName : '';
+
+    const ingredients: Array<{ quantity: number; unit: string; name: string }> = [];
+    const rawIngredients = recipe['recipeIngredient'];
+    if (Array.isArray(rawIngredients)) {
+      for (const raw of rawIngredients) {
+        if (typeof raw !== 'string') continue;
+        const parsed = this.parseIngredientLine(raw);
+        if (parsed) {
+          ingredients.push(parsed);
+        }
+      }
+    }
+
+    const instructions = this.collectInstructionSteps(
+      recipe['recipeInstructions']
+    ).join('\n\n');
+
+    return {
+      name,
+      ingredients,
+      instructions,
+    };
+  }
+
+  /**
+   * Flattens `recipeInstructions` into a list of step texts. Handles a plain
+   * string, arrays of strings or objects with a `text` field, and objects
+   * that nest their steps under `itemListElement`.
+   */
+  private collectInstructionSteps(node: unknown): string[] {
+    if (typeof node === 'string') return node ? [node] : [];
+    if (Array.isArray(node)) {
+      return node.flatMap((item: unknown) => this.collectInstructionSteps(item));
+    }
+    if (typeof node === 'object' && node !== null) {
+      const step = node as Record<string, unknown>;
+      const text = step['text'];
+      if (typeof text === 'string' && text) return [text];
+      return this.collectInstructionSteps(step['itemListElement']);
+    }
+    return [];
+  }
+
+  /**
+   * Converts an HTML fragment to text, turning block and list boundaries
+   * into line breaks so consecutive steps do not run together.
+   */
+  private htmlToText(fragment: string): string {
+    return fragment
+      .replace(/<br\s*\/?>|<\/(?:li|p|div|h[1-6])>/gi, '\n')
+      .replace(/<[^>]+>/g, '')
+      .split('\n')
+      .map((row) => row.trim())
+      .filter((row) => row.length > 0)
+      .join('\n');
+  }
+
+  /**
+   * Best-effort scraping of title, ingredients and instructions from HTML.
+   */
+  private parseFromHtml(html: string): ParsedRecipe {
+    // Title: first <h1>, then og:title, then <title>.
+    let name = '';
+    const titleMatch =
+      html.match(/<h1[^>]*>([^<]+)<\/h1>/i) ||
+      html.match(/<meta\s+property="og:title"\s+content="([^"]+)"/i) ||
+      html.match(/<title>([^<]+)<\/title>/i);
+
+    if (titleMatch) {
+      name = titleMatch[1].trim();
+    }
+
+    const ingredients: Array<{ quantity: number; unit: string; name: string }> = [];
+
+    // Most specific pattern first: elements carrying the class token
+    // "ingredient". Bare <li> elements are the last resort, because they also
+    // match navigation and footer lists. The item content is always the last
+    // capture group of each pattern.
+    const ingredientPatterns = [
+      /<(li|div|p)[^>]*class="(?:[^"]*\s)?ingredient(?:\s[^"]*)?"[^>]*>([\s\S]*?)<\/\1>/gi,
+      /<li[^>]*>([\s\S]*?)<\/li>/gi,
+    ];
+
+    for (const pattern of ingredientPatterns) {
+      let match: RegExpExecArray | null;
+      while ((match = pattern.exec(html)) !== null) {
+        const parsed = this.parseIngredientLine(match[match.length - 1]);
+        if (parsed && parsed.name.length > 2) {
+          // Skip very short names; they are most likely noise.
+          ingredients.push(parsed);
+        }
+      }
+      if (ingredients.length > 0) break; // Use the first pattern that yields results
+    }
+
+    // Instructions: a container whose class mentions instruction/method/step,
+    // otherwise the first ordered list. Both patterns need the `s` flag so
+    // that content spanning several lines is matched.
+    let instructions = '';
+    const instructionsPatterns = [
+      /<(?:div|section)[^>]*class="[^"]*(?:instruction|method|step)[^"]*"[^>]*>(.*?)<\/(?:div|section)>/is,
+      /<ol[^>]*>(.*?)<\/ol>/is,
+    ];
+
+    for (const pattern of instructionsPatterns) {
+      const match = html.match(pattern);
+      if (match) {
+        instructions = this.htmlToText(match[1]);
+        if (instructions.length > 10) break;
+      }
+    }
+
+    return {
+      name,
+      ingredients,
+      instructions,
+    };
+  }
+}
@@ -0,0 +1,124 @@
+import { RecipeParser, ParsedRecipe } from './base.parser';
+
+/**
+ * Parser for ica.se recipe pages.
+ * Uses JSON-LD structured data as the primary source and falls back to
+ * regex-based HTML scraping.
+ */
+export class IcaRecipeParser extends RecipeParser {
+  /**
+   * Accepts URLs whose host is ica.se (or a subdomain) and whose path starts
+   * with /recept. The pattern is anchored to the host so that look-alike
+   * domains ("notica.se") or URLs that merely mention "ica.se/recept" in a
+   * query string are not treated as ICA pages.
+   */
+  canHandle(url: string): boolean {
+    return /^(?:https?:\/\/)?(?:[a-z0-9-]+\.)*ica\.se(?::\d+)?\/recept/i.test(
+      url.trim()
+    );
+  }
+
+  /**
+   * Extracts a recipe from the page, preferring JSON-LD over HTML scraping.
+   *
+   * @param html Full HTML of the recipe page.
+   * @returns The extracted recipe; fields stay empty when nothing is found.
+   */
+  parse(html: string): ParsedRecipe {
+    console.log('[IcaParser] Parsing ICA recipe...');
+
+    try {
+      const recipe = this.findJsonLdRecipe(html);
+      if (recipe) {
+        console.log('[IcaParser] ✓ JSON-LD recipe found');
+        return this.extractFromJsonLd(recipe);
+      }
+    } catch (err) {
+      // Unexpected JSON-LD shapes must not abort the import; use HTML instead.
+      console.log('[IcaParser] JSON-LD parsing failed:', err);
+    }
+
+    // Fallback: HTML scraping.
+    console.log('[IcaParser] Falling back to HTML parsing');
+    return this.parseFromHtml(html);
+  }
+
+  /**
+   * Scans every JSON-LD script block for a Recipe node, skipping blocks that
+   * are not valid JSON instead of giving up on the first one.
+   */
+  private findJsonLdRecipe(html: string): Record<string, unknown> | null {
+    const scriptPattern =
+      /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
+    let script: RegExpExecArray | null;
+    while ((script = scriptPattern.exec(html)) !== null) {
+      let data: unknown;
+      try {
+        data = JSON.parse(script[1]);
+      } catch (err) {
+        console.log('[IcaParser] JSON-LD parsing failed:', err);
+        continue;
+      }
+      const recipe = this.findRecipeNode(data);
+      if (recipe) return recipe;
+    }
+    return null;
+  }
+
+  /**
+   * Finds the first node whose `@type` is (or includes) 'Recipe', descending
+   * into top-level arrays and `@graph` containers.
+   */
+  private findRecipeNode(node: unknown): Record<string, unknown> | null {
+    if (Array.isArray(node)) {
+      for (const item of node) {
+        const found = this.findRecipeNode(item);
+        if (found) return found;
+      }
+      return null;
+    }
+    if (typeof node !== 'object' || node === null) return null;
+
+    const record = node as Record<string, unknown>;
+    const type = record['@type'];
+    const types: unknown[] = Array.isArray(type) ? type : [type];
+    if (types.includes('Recipe')) return record;
+
+    return this.findRecipeNode(record['@graph']);
+  }
+
+  /**
+   * Maps a JSON-LD Recipe node to a ParsedRecipe.
+   */
+  private extractFromJsonLd(recipe: Record<string, unknown>): ParsedRecipe {
+    // Title
+    const rawName = recipe['name'];
+    const name = typeof rawName === 'string' ? rawName : '';
+
+    // Ingredients
+    const ingredients: Array<{ quantity: number; unit: string; name: string }> = [];
+    const rawIngredients = recipe['recipeIngredient'];
+    if (Array.isArray(rawIngredients)) {
+      for (const raw of rawIngredients) {
+        if (typeof raw !== 'string') continue;
+        const parsed = this.parseIngredientLine(raw);
+        if (parsed) {
+          ingredients.push(parsed);
+        }
+      }
+    }
+
+    // Instructions
+    const instructions = this.collectInstructionSteps(
+      recipe['recipeInstructions']
+    ).join('\n\n');
+
+    return {
+      name,
+      ingredients,
+      instructions,
+    };
+  }
+
+  /**
+   * Flattens `recipeInstructions` into a list of step texts. Handles a plain
+   * string, arrays of strings or objects with a `text` field, and objects
+   * that nest their steps under `itemListElement`.
+   */
+  private collectInstructionSteps(node: unknown): string[] {
+    if (typeof node === 'string') return node ? [node] : [];
+    if (Array.isArray(node)) {
+      return node.flatMap((item: unknown) => this.collectInstructionSteps(item));
+    }
+    if (typeof node === 'object' && node !== null) {
+      const step = node as Record<string, unknown>;
+      const text = step['text'];
+      if (typeof text === 'string' && text) return [text];
+      return this.collectInstructionSteps(step['itemListElement']);
+    }
+    return [];
+  }
+
+  /**
+   * Regex-based fallback used when the page has no usable JSON-LD.
+   */
+  private parseFromHtml(html: string): ParsedRecipe {
+    // Title: first <h1>, otherwise og:title.
+    let name = '';
+    const titleMatch = html.match(/<h1[^>]*>([^<]+)<\/h1>/i);
+    if (titleMatch) {
+      name = titleMatch[1].trim();
+    }
+
+    if (!name) {
+      const ogTitleMatch = html.match(
+        /<meta\s+property="og:title"\s+content="([^"]+)"/i
+      );
+      if (ogTitleMatch) {
+        name = ogTitleMatch[1].trim();
+      }
+    }
+
+    // Ingredient items may wrap amount and name in nested tags, so capture
+    // the whole element body; parseIngredientLine strips the markup.
+    const ingredients: Array<{ quantity: number; unit: string; name: string }> = [];
+    const ingredientRegex =
+      /<li[^>]*class="[^"]*ingredient[^"]*"[^>]*>([\s\S]*?)<\/li>/gi;
+    let match: RegExpExecArray | null;
+    while ((match = ingredientRegex.exec(html)) !== null) {
+      const parsed = this.parseIngredientLine(match[1]);
+      if (parsed) {
+        ingredients.push(parsed);
+      }
+    }
+
+    // Same for the instruction container: allow nested markup and strip it
+    // afterwards.
+    let instructions = '';
+    const instructionsMatch = html.match(
+      /<(?:div|section)[^>]*class="[^"]*(?:instruction|howto)[^"]*"[^>]*>([\s\S]*?)<\/(?:div|section)>/i
+    );
+    if (instructionsMatch) {
+      instructions = instructionsMatch[1].replace(/<[^>]+>/g, '').trim();
+    }
+
+    return {
+      name,
+      ingredients,
+      instructions,
+    };
+  }
+}
@@ -1,4 +1,7 @@
 import { Injectable, BadRequestException } from '@nestjs/common';
+import { IcaRecipeParser } from './parsers/ica.parser';
+import { GenericRecipeParser } from './parsers/generic.parser';
+import { RecipeParser } from './parsers/base.parser';

 export interface QuickImportResult {
  markdown: string;
@@ -25,20 +28,12 @@ export class QuickImportService {
    console.log('[QuickImport] isUrl:', isUrl, 'isPdf:', isPdf);

    if (isUrl) {
-      // Försök detektera webbplats
-      if (input.includes('ica.se')) {
-        console.log('[QuickImport] Detekterade ICA-länk, startar skrapning...');
-        return this.scrapeIcaRecipe(input);
-      } else {
-        console.log('[QuickImport] URL är inte från ICA.se');
-        throw new BadRequestException(
-          'Endast ICA-recept stöds för närvarande. Försök med en ICA-länk (ica.se)'
-        );
-      }
+      console.log('[QuickImport] Detekterade URL, försöker scrapa...');
+      return this.scrapeRecipeFromUrl(input);
    } else if (isPdf) {
-      console.log('[QuickImport] PDF-fil identifierad');
+      console.log('[QuickImport] Detekterade PDF-fil');
      throw new BadRequestException(
-        'PDF-import är under utveckling. Använd snabbimport för ICA-recept eller skriv in receptet manuellt.'
+        'PDF-import under utveckling. Försök med en URL från ICA.se eller annat receptsida.'
      );
    } else {
      console.log('[QuickImport] Input är inte URL eller PDF');
@@ -69,17 +64,15 @@ export class QuickImportService {
  }

  /**
-   * Skrapar recept från ICA.se
+   * Skrapar recept från en URL
   *
-   * Försöker hämta:
-   * - Recepttitel (från h1 eller meta title)
-   * - Ingredienser (från ingrediens-lista)
-   * - Instruktioner (från steg-lista eller beskrivning)
+   * Använder site-specifika parsers om tillgängliga,
+   * annars fallback till generisk parser.
   *
-   * @param url ICA-receptlänk
+   * @param url URL till receptsidan
   * @returns Markdown-format
   */
-  private async scrapeIcaRecipe(url: string): Promise<QuickImportResult> {
+  private async scrapeRecipeFromUrl(url: string): Promise<QuickImportResult> {
    try {
      console.log('[QuickImport] Hämtar HTML från:', url);

@@ -100,9 +93,29 @@ export class QuickImportService {
      const html = await response.text();
      console.log('[QuickImport] HTML längd:', html.length, 'tecken');

-      // Extrahera receptinformation från HTML
-      const recipe = this.parseIcaHtml(html);
-      console.log('[QuickImport] Parsad recept:', { name: recipe.name, ingredienser: recipe.ingredients.length });
+      // Välj lämplig parser
+      const parsers: RecipeParser[] = [
+        new IcaRecipeParser(),
+        new GenericRecipeParser(),
+      ];
+
+      let recipe = null;
+      for (const parser of parsers) {
+        if (parser.canHandle(url)) {
+          console.log('[QuickImport] Använder parser:', parser.constructor.name);
+          recipe = parser.parse(html);
+          break;
+        }
+      }
+
+      if (!recipe) {
+        throw new Error('Ingen parserutrustning tillgänglig');
+      }
+
+      console.log('[QuickImport] Parsad recept:', {
+        name: recipe.name,
+        ingredienser: recipe.ingredients.length,
+      });

      if (!recipe.name) {
        throw new Error('Kunde inte hitta receptnamn på sidan. Försök med en annan länk.');
@@ -112,80 +125,25 @@ export class QuickImportService {
      const markdown = this.recipeToMarkdown(recipe);
      console.log('[QuickImport] Markdown genererad, längd:', markdown.length);

+      // Detektera källa från URL
+      let source: 'ica' | 'pdf' | 'other' = 'other';
+      if (/ica\.se/i.test(url)) {
+        source = 'ica';
+      }
+
      return {
        markdown,
-        source: 'ica',
+        source,
      };
    } catch (err) {
      const message = err instanceof Error ? err.message : 'Okänt fel vid scraping';
      console.error('[QuickImport] ERROR:', message);
      throw new BadRequestException(
-        `Kunde inte hämta recept från ICA: ${message}. Kontrollera att länken är korrekt och försök igen.`
+        `Kunde inte hämta recept: ${message}. Kontrollera att länken är korrekt och försök igen.`
      );
    }
  }

-  /**
-   * Parsa ICA-receptsida (HTML)
-   *
-   * Denna är en simplified version. För full produrktion behöver du:
-   * - Headless browser (Puppeteer/Playwright)
-   * - API-integration eller scraping-bibliotek
-   * - Proper error handling för sidstruktur-ändringar
-   */
-  private parseIcaHtml(html: string): {
-    name: string;
-    description?: string;
-    ingredients: Array<{
-      quantity: number;
-      unit: string;
-      name: string;
-    }>;
-    instructions?: string;
-  } {
-    // Extrahera titel
-    let name = '';
-    const titleMatch = html.match(/<h1[^>]*>([^<]+)<\/h1>/i);
-    if (titleMatch) {
-      name = titleMatch[1].trim();
-    }
-
-    if (!name) {
-      const ogTitleMatch = html.match(/<meta\s+property="og:title"\s+content="([^"]+)"/i);
-      if (ogTitleMatch) {
-        name = ogTitleMatch[1].trim();
-      }
-    }
-
-    // Extrahera ingredienser (en enkel regex - kan behöva anpassas)
-    const ingredients: Array<{ quantity: number; unit: string; name: string }> = [];
-    const ingredientRegex = /(?:ingredients?|<li[^>]*>)([^<]*?(\d+(?:[.,]\d+)?)\s*([a-zåäö]*)\s*([^<]+))/gi;
-    let match;
-    while ((match = ingredientRegex.exec(html)) !== null) {
-      const quantity = parseFloat(match[2].replace(',', '.'));
-      const unit = match[3].toLowerCase().trim() || 'st';
-      const name = match[4].trim();
-      if (name) {
-        ingredients.push({ quantity, unit, name });
-      }
-    }
-
-    // Extrahera instruktioner (första paragraf eller instruktions-sektion)
-    let instructions = '';
-    const instructionsMatch = html.match(
-      /<(?:div|section)[^>]*class="[^"]*instruction[^"]*"[^>]*>([^<]*)<\/(?:div|section)>/is
-    );
-    if (instructionsMatch) {
-      instructions = instructionsMatch[1].replace(/<[^>]+>/g, '').trim();
-    }
-
-    return {
-      name,
-      ingredients: ingredients.length > 0 ? ingredients : [],
-      instructions,
-    };
-  }
-
  /**
   * Konvertera receptobjekt till Markdown-format
   */