feat: Implement PDF recipe parser and quick import service for file and URL inputs

2026-04-14 22:24:28 +02:00
parent e90fd2d670
commit 1ce1318bf5
10 changed files with 758 additions and 194 deletions
@@ -1,46 +1,85 @@
-import { Injectable, BadRequestException } from '@nestjs/common';
+import {
+  BadRequestException,
+  Injectable,
+  ServiceUnavailableException,
+  UnsupportedMediaTypeException,
+} from '@nestjs/common';
+import * as fs from 'node:fs/promises';
+import * as path from 'node:path';
+import * as pdfParse from 'pdf-parse';
+import { createWorker } from 'tesseract.js';
 import { IcaRecipeParser } from './parsers/ica.parser';
 import { GenericRecipeParser } from './parsers/generic.parser';
 import { RecipeParser } from './parsers/base.parser';

+/** Outcome of a quick import: the generated recipe Markdown plus a tag naming its origin. */
 export interface QuickImportResult {
+  /** The imported recipe rendered as Markdown. */
  markdown: string;
-  source: 'ica' | 'pdf' | 'other';
+  /** Origin tag: 'pdf' / 'image' for uploaded files, 'ica' for ica.se URLs, 'other' for any other URL. */
+  source: 'ica' | 'pdf' | 'image' | 'other';
 }

+/** File categories that getUploadKind can classify an upload as. */
+type UploadKind = 'pdf' | 'image';
+
@Injectable()
 export class QuickImportService {
  /**
   * Detekterar typ av input (URL eller filsökväg) och importerar från lämplig källa
   */
  async importFromInput(input: string): Promise<QuickImportResult> {
+    // Dispatch order: URL -> scrape the page; path-like string -> read the file and hand it
+    // to importFromUpload; anything else -> BadRequestException.
-    input = input.trim();
-    console.log('[QuickImport] Mottog input:', input);
+    const trimmed = input.trim();
+    console.log('[QuickImport] Mottog input:', trimmed);

-    if (!input) {
-      throw new BadRequestException('Du måste ange en URL eller filsökväg');
+    if (!trimmed) {
+      throw new BadRequestException('Du måste ange en URL eller ladda upp en fil');
    }

-    // Detektera typ
-    const isUrl = this.isUrl(input);
-    const isPdf = this.isPdfPath(input);
-
-    console.log('[QuickImport] isUrl:', isUrl, 'isPdf:', isPdf);
-
-    if (isUrl) {
+    if (this.isUrl(trimmed)) {
      console.log('[QuickImport] Detekterade URL, försöker scrapa...');
-      return this.scrapeRecipeFromUrl(input);
-    } else if (isPdf) {
-      console.log('[QuickImport] Detekterade PDF-fil');
-      throw new BadRequestException(
-        'PDF-import under utveckling. Försök med en URL från ICA.se eller annat receptsida.'
-      );
-    } else {
-      console.log('[QuickImport] Input är inte URL eller PDF');
-      throw new BadRequestException(
-        'Ogültig input. Ange en gyltig URL (t.ex. ica.se/recept/...) eller filsökväg'
-      );
+      return this.scrapeRecipeFromUrl(trimmed);
    }
+
+    // NOTE(review): `trimmed` is passed straight to fs.readFile, so this branch reads whatever
+    // path the server process can access. Presumably only trusted callers reach it; confirm,
+    // or restrict the path to an allowed directory.
+    if (this.looksLikeLocalFile(trimmed)) {
+      console.log('[QuickImport] Försöker läsa lokal fil:', trimmed);
+      try {
+        const buffer = await fs.readFile(trimmed);
+        // The promise is returned without `await`, so rejections from importFromUpload reach the
+        // caller unchanged; the catch below only sees errors thrown before this return.
+        return this.importFromUpload({
+          buffer,
+          originalname: path.basename(trimmed),
+          mimetype: this.getMimeTypeFromExtension(trimmed),
+        } as Express.Multer.File);
+      } catch (error) {
+        console.error('[QuickImport] Kunde inte läsa lokal fil:', error);
+        throw new BadRequestException(
+          'Kunde inte läsa filen. Använd filuppladdning i gränssnittet eller kontrollera sökvägen.',
+        );
+      }
+    }
+
+    throw new BadRequestException(
+      'Ogiltig input. Ange en giltig URL eller ladda upp en PDF- eller bildfil.',
+    );
+  }
+
+  /**
+   * Converts an uploaded PDF or image into recipe Markdown.
+   *
+   * @param file Uploaded file; `buffer`, `originalname` and `mimetype` are read.
+   * @returns Markdown built from the extracted text, tagged with the detected file kind.
+   * @throws BadRequestException when the file or its buffer is missing.
+   */
+  async importFromUpload(file: Express.Multer.File): Promise<QuickImportResult> {
+    if (!file?.buffer) {
+      throw new BadRequestException('Ingen fil skickades med.');
+    }
+
+    console.log('[QuickImport] Mottog uppladdad fil:', file.originalname, file.mimetype);
+
+    // getUploadKind throws for unsupported types, so a non-PDF result is always an image.
+    const uploadKind = this.getUploadKind(file);
+    const extractedText =
+      uploadKind === 'pdf'
+        ? await this.extractTextFromPdf(file.buffer)
+        : await this.extractTextFromImage(file.buffer);
+
+    return {
+      markdown: this.normalizeImportedTextToMarkdown(extractedText, file.originalname),
+      source: uploadKind,
+    };
  }

  /**
@@ -55,12 +94,157 @@ export class QuickImportService {
    }
  }

-  /**
-   * Kontrollerar om input är en PDF-filsökväg
-   */
-  private isPdfPath(input: string): boolean {
+  /**
+   * Heuristic for "this string is a file path": true when it contains a slash or
+   * backslash, or ends in one of the supported PDF/image extensions.
+   */
+  private looksLikeLocalFile(input: string): boolean {
    const normalized = input.toLowerCase();
-    return normalized.endsWith('.pdf');
+    const hasPathSeparator = /[\\/]/.test(input);
+    return hasPathSeparator || /\.(pdf|png|jpg|jpeg|webp|bmp)$/i.test(normalized);
+  }
+
+  /**
+   * Maps a file name's extension (case-insensitive) to a MIME type.
+   *
+   * @param filename File name or path; only the extension is inspected.
+   * @returns The MIME type for a supported extension, otherwise 'application/octet-stream'.
+   */
+  private getMimeTypeFromExtension(filename: string): string {
+    switch (path.extname(filename).toLowerCase()) {
+      case '.pdf':
+        return 'application/pdf';
+      case '.png':
+        return 'image/png';
+      case '.jpg':
+      case '.jpeg':
+        return 'image/jpeg';
+      case '.webp':
+        return 'image/webp';
+      case '.bmp':
+        return 'image/bmp';
+      default:
+        return 'application/octet-stream';
+    }
+  }
+
+  /**
+   * Classifies an upload as PDF or image from its MIME type and file name.
+   * The PDF check runs first, so a PDF signal wins over an image signal.
+   *
+   * @param file Object carrying the upload's `mimetype` and `originalname`.
+   * @returns 'pdf' or 'image'.
+   * @throws UnsupportedMediaTypeException when neither MIME type nor extension is supported.
+   */
+  private getUploadKind(
+    file: Pick<Express.Multer.File, 'mimetype' | 'originalname'>,
+  ): UploadKind {
+    const mime = (file.mimetype ?? '').toLowerCase();
+    const filename = (file.originalname ?? '').toLowerCase();
+    const imageExtensions = ['.png', '.jpg', '.jpeg', '.webp', '.bmp'];
+
+    const isPdf = mime.includes('pdf') || filename.endsWith('.pdf');
+    if (isPdf) {
+      return 'pdf';
+    }
+
+    const isImage =
+      mime.startsWith('image/') ||
+      imageExtensions.some((extension) => filename.endsWith(extension));
+    if (isImage) {
+      return 'image';
+    }
+
+    throw new UnsupportedMediaTypeException(
+      'Endast PDF, PNG, JPG, JPEG, WEBP och BMP stöds.',
+    );
+  }
+
+  /**
+   * Extracts the text layer of a PDF.
+   *
+   * @param buffer Raw PDF bytes.
+   * @returns The extracted text with NUL characters removed and surrounding whitespace trimmed.
+   * @throws ServiceUnavailableException when parsing the PDF fails.
+   * @throws BadRequestException when the PDF yields no text.
+   */
+  private async extractTextFromPdf(buffer: Buffer): Promise<string> {
+    let extracted: string | undefined;
+
+    try {
+      const parsed = await pdfParse(buffer);
+      extracted = parsed.text?.replace(/\u0000/g, '').trim();
+    } catch (error) {
+      console.error('[QuickImport] PDF ERROR:', error);
+      throw new ServiceUnavailableException('PDF-importen misslyckades.');
+    }
+
+    // Checked outside the try so this client error is never remapped by the catch above.
+    if (!extracted) {
+      throw new BadRequestException(
+        'PDF-filen saknar läsbar text. Prova bildimport om det är en skannad sida.',
+      );
+    }
+
+    return extracted;
+  }
+
+  /**
+   * Runs OCR (Swedish + English) on an image and returns the recognised text.
+   *
+   * @param buffer Raw image bytes.
+   * @returns The recognised text, trimmed.
+   * @throws BadRequestException when OCR finds no text.
+   * @throws ServiceUnavailableException when the OCR worker cannot be created or recognition fails.
+   */
+  private async extractTextFromImage(buffer: Buffer): Promise<string> {
+    let worker: Awaited<ReturnType<typeof createWorker>> | undefined;
+
+    try {
+      // Created inside the try so a failed worker start-up is logged and mapped like any
+      // other OCR failure instead of escaping as an unhandled error.
+      worker = await createWorker('swe+eng');
+      const result = await worker.recognize(buffer);
+      const text = result.data.text?.trim();
+
+      if (!text) {
+        throw new BadRequestException('Ingen text hittades i bilden.');
+      }
+
+      return text;
+    } catch (error) {
+      if (error instanceof BadRequestException) {
+        throw error;
+      }
+
+      console.error('[QuickImport] OCR ERROR:', error);
+      throw new ServiceUnavailableException('OCR-importen misslyckades.');
+    } finally {
+      // Best-effort cleanup: a failing terminate() must not replace the result or error above.
+      await worker?.terminate().catch((terminateError: unknown) => {
+        console.error('[QuickImport] Kunde inte avsluta OCR-worker:', terminateError);
+      });
+    }
+  }
+
+  /**
+   * Turns raw text extracted from a PDF or image into the recipe Markdown layout:
+   * title, "Ingredienser" list, "Tillvägagångssätt" section and an optional source line.
+   *
+   * The first line longer than three characters becomes the title. Heading lines switch
+   * the current section; until a heading is seen, the first content line decides the
+   * section via looksLikeIngredientLine. Placeholder lines are emitted for empty sections.
+   *
+   * @param text Raw extracted text.
+   * @param sourceName Optional file name appended as a "Källa:" line.
+   * @returns The assembled Markdown, with blank lines separating title, sections and source.
+   * @throws BadRequestException when the text is empty after whitespace cleanup.
+   */
+  private normalizeImportedTextToMarkdown(text: string, sourceName?: string): string {
+    const cleanedText = text
+      .replace(/\r/g, '')
+      .replace(/[ \t]+/g, ' ')
+      .replace(/\n{3,}/g, '\n\n')
+      .trim();
+
+    if (!cleanedText) {
+      throw new BadRequestException('Ingen läsbar text hittades i filen.');
+    }
+
+    const lines = cleanedText.split('\n').map((rawLine) => rawLine.trim());
+    const title = lines.find((line) => line.length > 3) ?? 'Importerat recept';
+    const ingredients: string[] = [];
+    const instructions: string[] = [];
+    let section: 'unknown' | 'ingredients' | 'instructions' = 'unknown';
+
+    for (const line of lines) {
+      // Every line equal to the title is skipped, including later repeats of it.
+      if (!line || line === title) {
+        continue;
+      }
+
+      const lower = line.toLowerCase();
+
+      if (/^ingred/i.test(lower)) {
+        section = 'ingredients';
+        continue;
+      }
+
+      if (/^(gör så här|gor sa har|instruktioner|tillvägagångssätt|tillvagagangssatt|method|instructions)/i.test(lower)) {
+        section = 'instructions';
+        continue;
+      }
+
+      if (section === 'unknown') {
+        section = this.looksLikeIngredientLine(line) ? 'ingredients' : 'instructions';
+      }
+
+      if (section === 'ingredients') {
+        ingredients.push(line.startsWith('-') ? line : `- ${line}`);
+      } else {
+        instructions.push(line);
+      }
+    }
+
+    const markdownLines = [
+      `# ${title}`,
+      '',
+      '## Ingredienser',
+      ...(ingredients.length > 0 ? ingredients : ['- Komplettera ingredienser manuellt']),
+      '',
+      '## Tillvägagångssätt',
+      ...(instructions.length > 0 ? instructions : ['Komplettera tillagningsstegen manuellt.']),
+    ];
+
+    // Append the source conditionally instead of filtering falsy entries afterwards:
+    // filtering also removed the '' separators, which collapsed all sections together.
+    if (sourceName) {
+      markdownLines.push('', `Källa: ${sourceName}`);
+    }
+
+    return markdownLines.join('\n');
+  }
+
+  /**
+   * Heuristic: does this line look like an ingredient entry rather than an instruction?
+   *
+   * True when the line starts with a bullet, starts with a quantity ("2 ", "1,5 ", "0.5 "),
+   * or contains a measurement unit as a standalone token ("dl", "msk", "g", ...), optionally
+   * glued to a preceding number ("500g").
+   *
+   * @param line A single trimmed line of recipe text.
+   * @returns true when the line is probably an ingredient.
+   */
+  private looksLikeIngredientLine(line: string): boolean {
+    const startsWithBullet = /^[-*•]\s+/.test(line);
+
+    // A decimal separator must be followed by digits, so numbered steps such as
+    // "1. Värm ugnen" are not mistaken for quantities.
+    const startsWithQuantity = /^\d+(?:[.,]\d+)?\s+/.test(line);
+
+    // `\b` is ASCII-only in JavaScript, so it sees a word boundary inside "hög" or "bäst"
+    // and would match the units "g" and "st" there. Unicode-aware lookarounds avoid that.
+    const containsUnit =
+      /(?<!\p{L})(?:g|kg|hg|mg|ml|dl|cl|l|tsk|msk|krm|st|pkt|förp|klyfta)(?![\p{L}\p{N}])/iu.test(
+        line,
+      );
+
+    return startsWithBullet || startsWithQuantity || containsUnit;
  }

  /**
@@ -76,7 +260,6 @@ export class QuickImportService {
    try {
      console.log('[QuickImport] Hämtar HTML från:', url);

-      // Hämta HTML från URL
      const response = await fetch(url, {
        headers: {
          'User-Agent':
@@ -93,7 +276,6 @@ export class QuickImportService {
      const html = await response.text();
      console.log('[QuickImport] HTML längd:', html.length, 'tecken');

-      // Välj lämplig parser
      const parsers: RecipeParser[] = [
        new IcaRecipeParser(),
        new GenericRecipeParser(),
@@ -121,12 +303,10 @@ export class QuickImportService {
        throw new Error('Kunde inte hitta receptnamn på sidan. Försök med en annan länk.');
      }

-      // Konvertera till Markdown-format
      const markdown = this.recipeToMarkdown(recipe, url);
      console.log('[QuickImport] Markdown genererad, längd:', markdown.length);

-      // Detektera källa från URL
-      let source: 'ica' | 'pdf' | 'other' = 'other';
+      let source: 'ica' | 'pdf' | 'image' | 'other' = 'other';
      if (/ica\.se/i.test(url)) {
        source = 'ica';
      }
@@ -163,17 +343,14 @@ export class QuickImportService {
  ): string {
    const lines: string[] = [];

-    // Titel
    lines.push(`# ${recipe.name}`);
    lines.push('');

-    // Beskrivning
    if (recipe.description) {
      lines.push(recipe.description);
      lines.push('');
    }

-    // Ingredienser
    if (recipe.ingredients.length > 0) {
      lines.push('## Ingredienser');
      for (const ing of recipe.ingredients) {
@@ -185,14 +362,12 @@ export class QuickImportService {
      lines.push('');
    }

-    // Instruktioner
    if (recipe.instructions) {
      lines.push('## Tillvägagångssätt');
      lines.push(recipe.instructions);
      lines.push('');
    }

-    // Källa
    if (sourceUrl) {
      lines.push('---');
      lines.push('');