feat: Implement PDF document import functionality with Markdown conversion

- Added DocumentImportModule, DocumentImportController, and DocumentImportService for handling PDF uploads.
- Integrated pdf-parse for extracting text from PDF files.
- Created PdfParser for parsing PDF documents and converting them to Markdown format.
- Updated frontend to support file uploads via drag-and-drop and file input for PDF documents.
- Modified API routes to handle document import requests.
- Enhanced error handling for unsupported file types and file size limits.
- Updated README to reflect new features and usage instructions.
This commit is contained in:
Nils-Johan Gynther
2026-04-12 18:57:40 +02:00
parent a1a4f9beb3
commit e18bf79395
10 changed files with 538 additions and 290 deletions
+2
View File
@@ -1,9 +1,11 @@
import { Module } from '@nestjs/common';
import { QuickImportModule } from './quick-import/quick-import.module';
import { RecipesModule } from './recipes/recipes.module';
import { DocumentImportModule } from './document-import/document-import.module';
@Module({
imports: [
DocumentImportModule,
QuickImportModule,
RecipesModule,
],
@@ -0,0 +1,37 @@
import {
Controller,
Post,
UploadedFile,
UseInterceptors,
BadRequestException,
} from '@nestjs/common';
import { FileInterceptor } from '@nestjs/platform-express';
import { DocumentImportService, DocumentImportResult } from './document-import.service';
/** Upload size cap (50 MB) handed to multer through the interceptor's `limits` option. */
const UPLOAD_SIZE_LIMIT_BYTES = 50 * 1024 * 1024;

/** Error text used when the request contains no "file" part. */
const MISSING_FILE_MESSAGE =
  'Ingen fil mottagen. Skicka en PDF-fil med fältet "file" i multipart/form-data.';

/**
 * HTTP entry point for document import, mounted at `api/document-import`.
 * All processing is delegated to {@link DocumentImportService}.
 */
@Controller('api/document-import')
export class DocumentImportController {
  constructor(private readonly documentImportService: DocumentImportService) {}

  /**
   * POST /api/document-import
   *
   * Upload a PDF file and convert it to Markdown.
   * Expects multipart/form-data with the file in the "file" field.
   *
   * @param file The uploaded file as provided by the file interceptor.
   * @returns The import result produced by the service.
   * @throws BadRequestException when no file was received.
   */
  @Post()
  @UseInterceptors(
    FileInterceptor('file', {
      limits: { fileSize: UPLOAD_SIZE_LIMIT_BYTES },
    })
  )
  async importDocument(
    @UploadedFile() file: Express.Multer.File
  ): Promise<DocumentImportResult> {
    if (file) {
      return this.documentImportService.importFromFile(file);
    }

    throw new BadRequestException(MISSING_FILE_MESSAGE);
  }
}
@@ -0,0 +1,9 @@
import { Module } from '@nestjs/common';
import { DocumentImportController } from './document-import.controller';
import { DocumentImportService } from './document-import.service';
/**
 * Nest module for the document-import feature.
 * Provides DocumentImportService and registers DocumentImportController.
 */
@Module({
  providers: [DocumentImportService],
  controllers: [DocumentImportController],
})
export class DocumentImportModule {}
@@ -0,0 +1,55 @@
import { Injectable, BadRequestException } from '@nestjs/common';
import { PdfParser } from './parsers/pdf.parser';
/**
* Result returned by DocumentImportService.importFromFile for an imported document.
*/
export interface DocumentImportResult {
// Document body; importFromFile fills this from the parser's `content` field.
markdown: string;
// Document title as reported by the parser.
title: string;
// Kind of source document; 'pdf' is the only value the type allows.
documentType: 'pdf';
// Optional extra information passed through from the parser, if any.
metadata?: Record<string, unknown>;
}
// Largest file size, in bytes, that importFromFile accepts (compared against file.size).
const MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024; // 50 MB
// MIME types importFromFile accepts (compared against file.mimetype); PDF only.
const ALLOWED_MIME_TYPES = ['application/pdf'];
/**
 * Validates an uploaded file and converts it into a DocumentImportResult
 * by running it through the PDF parser.
 */
@Injectable()
export class DocumentImportService {
  private readonly pdfParser = new PdfParser();

  /**
   * Checks the upload's MIME type and size, parses it, and maps the parser
   * output onto the import result shape.
   *
   * @param file The uploaded file; its `buffer` is handed to the parser.
   * @returns Markdown, title, document type and parser metadata.
   * @throws BadRequestException when the MIME type is not allowed, the file
   *         exceeds the size limit, or parsing fails for any reason.
   */
  async importFromFile(file: Express.Multer.File): Promise<DocumentImportResult> {
    const { originalname, mimetype, size } = file;

    console.log(
      '[DocumentImport] Mottog fil:',
      originalname,
      '— Typ:',
      mimetype,
      '— Storlek:',
      size,
      'bytes'
    );

    const typeIsAllowed = ALLOWED_MIME_TYPES.includes(mimetype);
    if (!typeIsAllowed) {
      throw new BadRequestException(
        `Filtypen "${mimetype}" stöds inte. Endast PDF-filer accepteras för tillfället.`
      );
    }

    const exceedsLimit = size > MAX_FILE_SIZE_BYTES;
    if (exceedsLimit) {
      // Report the size rounded to whole megabytes.
      const sizeInMegabytes = Math.round(size / 1024 / 1024);
      throw new BadRequestException(
        `Filen är för stor (${sizeInMegabytes} MB). Maximal filstorlek är 50 MB.`
      );
    }

    try {
      const parsed = await this.pdfParser.parse(file.buffer, originalname);
      const result: DocumentImportResult = {
        markdown: parsed.content,
        title: parsed.title,
        documentType: 'pdf',
        metadata: parsed.metadata,
      };
      return result;
    } catch (error) {
      // Any parse failure is logged and surfaced to the client as a 400 with the reason.
      let message = 'Okänt fel vid parsning';
      if (error instanceof Error) {
        message = error.message;
      }
      console.error('[DocumentImport] Parse-fel för', originalname, ':', message);
      throw new BadRequestException(`Kunde inte läsa dokumentet: ${message}`);
    }
  }
}
@@ -0,0 +1,44 @@
/**
* Abstract base for document parsers.
* All document-type-specific parsers should extend it.
*/
/**
* Structured output of a document parser's `parse` call.
*/
export interface ParsedDocument {
// Title of the parsed document.
title: string;
content: string; // Markdown format
// Optional parser-specific extra information.
metadata?: Record<string, unknown>;
}
/**
 * Base class for document parsers. Subclasses implement `parse`; the shared
 * `textToMarkdown` helper turns extracted plain text into Markdown.
 */
export abstract class DocumentParser {
  /**
   * Parse a document buffer and return structured data.
   *
   * @param buffer Raw bytes of the document.
   * @param filename Name of the file the bytes came from.
   */
  abstract parse(buffer: Buffer, filename: string): Promise<ParsedDocument>;

  /**
   * Converts free text to Markdown.
   *
   * Every line is trimmed. Consecutive non-empty lines are joined with a
   * single space into one paragraph; blank lines separate paragraphs.
   * The result is an H1 heading with `title`, a blank line, and the
   * paragraphs separated by blank lines.
   *
   * @param text Plain text to convert.
   * @param title Text placed in the leading `# ` heading.
   * @returns The Markdown document as a string.
   */
  protected textToMarkdown(text: string, title: string): string {
    const paragraphs: string[] = [];
    let pending: string[] = [];

    // Close the paragraph being collected, if there is one.
    const flush = (): void => {
      if (pending.length === 0) {
        return;
      }
      paragraphs.push(pending.join(' '));
      pending = [];
    };

    for (const rawLine of text.split('\n')) {
      const line = rawLine.trim();
      if (line === '') {
        flush();
      } else {
        pending.push(line);
      }
    }
    flush();

    return `# ${title}\n\n${paragraphs.join('\n\n')}`;
  }
}
@@ -0,0 +1,48 @@
import * as pdfParse from 'pdf-parse';
import { DocumentParser, ParsedDocument } from './document.parser';
/**
 * Parser for PDF files: extracts the text with pdf-parse and converts it to
 * Markdown through the base class helper.
 */
export class PdfParser extends DocumentParser {
  /**
   * Extracts text from a PDF buffer and returns it as a Markdown document.
   *
   * @param buffer Raw bytes of the PDF.
   * @param filename Original file name; used for logging and to derive the title.
   * @returns Title, Markdown content and metadata (page count, producer,
   *          creation date, character count).
   * @throws Error when pdf-parse fails (with a dedicated message when the
   *         failure mentions a password) or when the extracted text, after
   *         trimming, is 20 characters or shorter.
   */
  async parse(buffer: Buffer, filename: string): Promise<ParsedDocument> {
    console.log('[PdfParser] Parsing:', filename, '— Storlek:', buffer.length, 'bytes');

    let data: Awaited<ReturnType<typeof pdfParse>>;
    try {
      data = await pdfParse(buffer);
    } catch (err) {
      // Password-protected or damaged PDFs end up here.
      const reason = err instanceof Error ? err.message : String(err);
      const mentionsPassword = reason.toLowerCase().includes('password');
      if (mentionsPassword) {
        throw new Error('PDF-filen är lösenordsskyddad och kan inte läsas');
      }
      throw new Error(`Kunde inte läsa PDF: ${reason}`);
    }

    if (!data.text || data.text.trim().length <= 20) {
      // Text extraction produced (almost) nothing — most likely a scanned image PDF.
      throw new Error(
        'PDFen verkar vara en skannad bild utan textlager. OCR-stöd kommer i nästa version.'
      );
    }

    console.log(
      `[PdfParser] Extraherade ${data.numpages} sidor, ${data.text.length} tecken från ${filename}`
    );

    // Title: file name without the .pdf extension, with runs of "_" / "-" turned into spaces.
    const withoutExtension = filename.replace(/\.pdf$/i, '');
    const title = withoutExtension.replace(/[_-]+/g, ' ').trim();

    const info = data.info as Record<string, unknown>;

    return {
      title,
      content: this.textToMarkdown(data.text, title),
      metadata: {
        pageCount: data.numpages,
        producer: info?.Producer ?? null,
        creationDate: info?.CreationDate ?? null,
        characterCount: data.text.length,
      },
    };
  }
}