feat: Implement PDF document import functionality with Markdown conversion

- Added DocumentImportModule, DocumentImportController, and DocumentImportService for handling PDF uploads.
- Integrated pdf-parse for extracting text from PDF files.
- Created PdfParser for parsing PDF documents and converting them to Markdown format.
- Updated frontend to support file uploads via drag-and-drop and file input for PDF documents.
- Modified API routes to handle document import requests.
- Enhanced error handling for unsupported file types and file size limits.
- Updated README to reflect new features and usage instructions.
2026-04-12 18:57:40 +02:00
parent a1a4f9beb3
commit e18bf79395
10 changed files with 538 additions and 290 deletions
@@ -1,9 +1,11 @@
 import { Module } from '@nestjs/common';
 import { QuickImportModule } from './quick-import/quick-import.module';
 import { RecipesModule } from './recipes/recipes.module';
+import { DocumentImportModule } from './document-import/document-import.module';

@Module({
  imports: [
+    DocumentImportModule,
    QuickImportModule,
    RecipesModule,
  ],
@@ -0,0 +1,37 @@
+import {
+  Controller,
+  Post,
+  UploadedFile,
+  UseInterceptors,
+  BadRequestException,
+} from '@nestjs/common';
+import { FileInterceptor } from '@nestjs/platform-express';
+import { DocumentImportService, DocumentImportResult } from './document-import.service';
+
+@Controller('api/document-import') // Every route in this controller lives under /api/document-import
+export class DocumentImportController { // HTTP entry point that hands uploaded files to DocumentImportService
+  constructor(private readonly documentImportService: DocumentImportService) {}
+
+  /**
+   * POST /api/document-import
+   * Upload a PDF file and convert it to Markdown.
+   * Expects multipart/form-data with the file in the "file" field.
+   *
+   * Resolves to whatever DocumentImportService.importFromFile returns for the
+   * uploaded file. Throws BadRequestException when the request carried no file.
+   */
+  @Post()
+  @UseInterceptors(
+    FileInterceptor('file', {
+      limits: { fileSize: 50 * 1024 * 1024 }, // 50 MB limit at the multer level
+    })
+  )
+  async importDocument(
+    @UploadedFile() file: Express.Multer.File
+  ): Promise<DocumentImportResult> {
+    if (!file) { // The interceptor found no "file" part in the request
+      throw new BadRequestException(
+        'Ingen fil mottagen. Skicka en PDF-fil med fältet "file" i multipart/form-data.'
+      );
+    }
+
+    return this.documentImportService.importFromFile(file); // Validation and parsing are delegated to the service
+  }
+}
@@ -0,0 +1,9 @@
+import { Module } from '@nestjs/common';
+import { DocumentImportController } from './document-import.controller';
+import { DocumentImportService } from './document-import.service';
+
+@Module({ // Feature module bundling the document-import controller and service
+  controllers: [DocumentImportController], // Registers the document-import HTTP routes
+  providers: [DocumentImportService], // Service injected into the controller's constructor
+})
+export class DocumentImportModule {}
@@ -0,0 +1,55 @@
+import { Injectable, BadRequestException } from '@nestjs/common';
+import { PdfParser } from './parsers/pdf.parser';
+
+export interface DocumentImportResult { // Shape returned by DocumentImportService.importFromFile
+  markdown: string; // Markdown rendering of the document (taken from the parser's `content`)
+  title: string; // Title reported by the parser
+  documentType: 'pdf'; // Only 'pdf' is produced at present
+  metadata?: Record<string, unknown>; // Optional extra details passed through from the parser
+}
+
+const MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024; // 50 MB; importFromFile rejects files larger than this
+const ALLOWED_MIME_TYPES = ['application/pdf']; // MIME types importFromFile accepts; anything else is rejected
+
+@Injectable()
+export class DocumentImportService {
+  private readonly pdfParser = new PdfParser();
+
+  /**
+   * Validate an uploaded file and convert it to Markdown.
+   *
+   * The file is logged, checked against the allowed MIME types and the
+   * maximum size, and then handed to the PDF parser.
+   *
+   * @param file The uploaded file; its `buffer`, `originalname`, `mimetype` and `size` are read.
+   * @returns The parser's content, title and metadata, tagged as a PDF document.
+   * @throws BadRequestException for an unsupported MIME type, an oversized
+   *         file, or any error raised while parsing.
+   */
+  async importFromFile(file: Express.Multer.File): Promise<DocumentImportResult> {
+    const { originalname, mimetype, size } = file;
+
+    console.log(
+      '[DocumentImport] Mottog fil:',
+      originalname,
+      '— Typ:',
+      mimetype,
+      '— Storlek:',
+      size,
+      'bytes'
+    );
+
+    const isSupportedType = ALLOWED_MIME_TYPES.some((allowed) => allowed === mimetype);
+    if (!isSupportedType) {
+      throw new BadRequestException(
+        `Filtypen "${mimetype}" stöds inte. Endast PDF-filer accepteras för tillfället.`
+      );
+    }
+
+    if (size > MAX_FILE_SIZE_BYTES) {
+      const sizeInMegabytes = Math.round(size / 1024 / 1024);
+      throw new BadRequestException(
+        `Filen är för stor (${sizeInMegabytes} MB). Maximal filstorlek är 50 MB.`
+      );
+    }
+
+    try {
+      const { content, title, metadata } = await this.pdfParser.parse(file.buffer, originalname);
+      return { markdown: content, title, documentType: 'pdf', metadata };
+    } catch (error) {
+      // Every parse failure is reported to the client as a bad request.
+      let message = 'Okänt fel vid parsning';
+      if (error instanceof Error) {
+        message = error.message;
+      }
+      console.error('[DocumentImport] Parse-fel för', originalname, ':', message);
+      throw new BadRequestException(`Kunde inte läsa dokumentet: ${message}`);
+    }
+  }
+}
@@ -0,0 +1,44 @@
+/**
+ * Abstract base for document parsers.
+ * All document-type-specific parsers should extend it.
+ *
+ * ParsedDocument, declared directly below, is the result shape that
+ * DocumentParser.parse resolves to.
+ */
+export interface ParsedDocument {
+  title: string; // Title of the parsed document
+  content: string; // Markdown format
+  metadata?: Record<string, unknown>; // Optional parser-specific details
+}
+
+export abstract class DocumentParser {
+  /**
+   * Parse a document buffer and return structured data.
+   *
+   * @param buffer   Raw bytes of the document.
+   * @param filename Name of the file the bytes came from.
+   */
+  abstract parse(buffer: Buffer, filename: string): Promise<ParsedDocument>;
+
+  /**
+   * Convert free text to Markdown.
+   *
+   * Consecutive non-blank lines are merged into one paragraph (joined with a
+   * single space); blank lines separate paragraphs. The result starts with a
+   * level-one heading built from `title`.
+   *
+   * @param text  Plain text, split on "\n"; every line is trimmed first.
+   * @param title Text placed in the leading "# " heading.
+   * @returns The heading, a blank line, and the paragraphs separated by blank lines.
+   */
+  protected textToMarkdown(text: string, title: string): string {
+    const paragraphs: string[] = [];
+    let pendingLines: string[] = [];
+
+    // Close the paragraph currently being collected, if it has any lines.
+    const flushParagraph = (): void => {
+      if (pendingLines.length === 0) {
+        return;
+      }
+      paragraphs.push(pendingLines.join(' '));
+      pendingLines = [];
+    };
+
+    for (const rawLine of text.split('\n')) {
+      const line = rawLine.trim();
+      if (line === '') {
+        flushParagraph();
+      } else {
+        pendingLines.push(line);
+      }
+    }
+    flushParagraph();
+
+    const body = paragraphs.filter((paragraph) => paragraph !== '').join('\n\n');
+    return ['# ' + title, body].join('\n\n');
+  }
+}
@@ -0,0 +1,48 @@
+import * as pdfParse from 'pdf-parse';
+import { DocumentParser, ParsedDocument } from './document.parser';
+
+export class PdfParser extends DocumentParser {
+  /**
+   * Extract the text of a PDF and convert it to Markdown.
+   *
+   * @param buffer   Raw bytes of the PDF file.
+   * @param filename Original file name; used for logging and as the primary
+   *                 source of the document title.
+   * @returns The title, the Markdown content and metadata (page count,
+   *          producer, creation date, character count).
+   * @throws Error when pdf-parse fails (with a dedicated message when the
+   *         failure mentions a password), or when 20 or fewer non-whitespace-
+   *         trimmed characters of text were extracted.
+   */
+  async parse(buffer: Buffer, filename: string): Promise<ParsedDocument> {
+    console.log('[PdfParser] Parsing:', filename, '— Storlek:', buffer.length, 'bytes');
+
+    let data: Awaited<ReturnType<typeof pdfParse>>;
+
+    try {
+      data = await pdfParse(buffer);
+    } catch (err) {
+      // Password-protected or damaged PDFs
+      const message = err instanceof Error ? err.message : String(err);
+      if (message.toLowerCase().includes('password')) {
+        throw new Error('PDF-filen är lösenordsskyddad och kan inte läsas');
+      }
+      throw new Error(`Kunde inte läsa PDF: ${message}`);
+    }
+
+    const hasText = data.text && data.text.trim().length > 20;
+
+    if (!hasText) {
+      // Text extraction produced (almost) nothing; most likely a scanned image PDF
+      throw new Error(
+        'PDFen verkar vara en skannad bild utan textlager. OCR-stöd kommer i nästa version.'
+      );
+    }
+
+    console.log(
+      `[PdfParser] Extraherade ${data.numpages} sidor, ${data.text.length} tecken från ${filename}`
+    );
+
+    // `info` may be missing; fall back to an empty record so lookups stay safe.
+    const info = (data.info ?? {}) as Record<string, unknown>;
+    const title = this.resolveTitle(filename, info.Title);
+    const markdown = this.textToMarkdown(data.text, title);
+
+    return {
+      title,
+      content: markdown,
+      metadata: {
+        pageCount: data.numpages,
+        producer: info.Producer ?? null,
+        creationDate: info.CreationDate ?? null,
+        characterCount: data.text.length,
+      },
+    };
+  }
+
+  /**
+   * Pick a non-empty document title.
+   *
+   * The file name (without ".pdf", with runs of "_" / "-" turned into spaces)
+   * wins when it is non-empty. A name such as ".pdf" or "___.pdf" would
+   * otherwise yield an empty title and a bare "# " heading, so the PDF's
+   * embedded title is tried next, and a fixed placeholder is the last resort.
+   */
+  private resolveTitle(filename: string, embeddedTitle: unknown): string {
+    const fromFilename = filename.replace(/\.pdf$/i, '').replace(/[_-]+/g, ' ').trim();
+    if (fromFilename.length > 0) {
+      return fromFilename;
+    }
+
+    // Collapse whitespace so a multi-line embedded title cannot break the heading.
+    const fromMetadata =
+      typeof embeddedTitle === 'string' ? embeddedTitle.replace(/\s+/g, ' ').trim() : '';
+    return fromMetadata.length > 0 ? fromMetadata : 'Namnlöst dokument';
+  }
+}