feat: Implement PDF document import functionality with Markdown conversion
- Added DocumentImportModule, DocumentImportController, and DocumentImportService for handling PDF uploads.
- Integrated pdf-parse for extracting text from PDF files.
- Created PdfParser for parsing PDF documents and converting them to Markdown format.
- Updated frontend to support file uploads via drag-and-drop and file input for PDF documents.
- Modified API routes to handle document import requests.
- Enhanced error handling for unsupported file types and file size limits.
- Updated README to reflect new features and usage instructions.
This commit is contained in:
@@ -1,9 +1,11 @@
|
||||
import { Module } from '@nestjs/common';
|
||||
import { QuickImportModule } from './quick-import/quick-import.module';
|
||||
import { RecipesModule } from './recipes/recipes.module';
|
||||
import { DocumentImportModule } from './document-import/document-import.module';
|
||||
|
||||
@Module({
|
||||
imports: [
|
||||
DocumentImportModule,
|
||||
QuickImportModule,
|
||||
RecipesModule,
|
||||
],
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
import {
|
||||
Controller,
|
||||
Post,
|
||||
UploadedFile,
|
||||
UseInterceptors,
|
||||
BadRequestException,
|
||||
} from '@nestjs/common';
|
||||
import { FileInterceptor } from '@nestjs/platform-express';
|
||||
import { DocumentImportService, DocumentImportResult } from './document-import.service';
|
||||
|
||||
/** Upload size cap in bytes (50 MB), applied at the multer level. */
const UPLOAD_LIMIT_BYTES = 50 * 1024 * 1024;

/**
 * HTTP controller for document import, mounted at `api/document-import`.
 */
@Controller('api/document-import')
export class DocumentImportController {
  constructor(private readonly documentImportService: DocumentImportService) {}

  /**
   * POST /api/document-import
   *
   * Upload a PDF file and convert it to Markdown.
   * Expects multipart/form-data with the file in the "file" field.
   *
   * @param file The uploaded file extracted by the file interceptor.
   * @returns The result of `DocumentImportService.importFromFile`.
   * @throws BadRequestException when no file was received.
   */
  @Post()
  @UseInterceptors(FileInterceptor('file', { limits: { fileSize: UPLOAD_LIMIT_BYTES } }))
  async importDocument(@UploadedFile() file: Express.Multer.File): Promise<DocumentImportResult> {
    if (file) {
      return this.documentImportService.importFromFile(file);
    }

    // No "file" part in the request: reject instead of calling the service.
    throw new BadRequestException(
      'Ingen fil mottagen. Skicka en PDF-fil med fältet "file" i multipart/form-data.'
    );
  }
}
|
||||
@@ -0,0 +1,9 @@
|
||||
import { Module } from '@nestjs/common';
|
||||
import { DocumentImportController } from './document-import.controller';
|
||||
import { DocumentImportService } from './document-import.service';
|
||||
|
||||
/**
 * Nest module that registers the document-import controller and its service.
 */
@Module({
  providers: [DocumentImportService],
  controllers: [DocumentImportController],
})
export class DocumentImportModule {}
|
||||
@@ -0,0 +1,55 @@
|
||||
import { Injectable, BadRequestException } from '@nestjs/common';
|
||||
import { PdfParser } from './parsers/pdf.parser';
|
||||
|
||||
/**
 * Result returned after a document has been imported.
 */
export interface DocumentImportResult {
  /** Document content in Markdown format. */
  markdown: string;
  /** Title reported by the parser. */
  title: string;
  /** Source document type; only PDF is produced here. */
  documentType: 'pdf';
  /** Optional parser-supplied metadata. */
  metadata?: Record<string, unknown>;
}

/** Largest accepted file size in bytes (50 MB). */
const MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024;

/** MIME types accepted for import. */
const ALLOWED_MIME_TYPES = ['application/pdf'];

/**
 * Validates uploaded files and converts them to Markdown via `PdfParser`.
 */
@Injectable()
export class DocumentImportService {
  private readonly pdfParser = new PdfParser();

  /**
   * Validates the upload's MIME type and size, then parses it.
   *
   * @param file The uploaded file (metadata plus in-memory buffer).
   * @returns Markdown, title, document type and parser metadata.
   * @throws BadRequestException for a disallowed MIME type, a file above
   *   the size limit, or any error raised while parsing.
   */
  async importFromFile(file: Express.Multer.File): Promise<DocumentImportResult> {
    const { originalname, mimetype, size } = file;

    console.log(
      '[DocumentImport] Mottog fil:',
      originalname,
      '— Typ:',
      mimetype,
      '— Storlek:',
      size,
      'bytes'
    );

    const isAllowedType = ALLOWED_MIME_TYPES.includes(mimetype);
    if (!isAllowedType) {
      throw new BadRequestException(
        `Filtypen "${mimetype}" stöds inte. Endast PDF-filer accepteras för tillfället.`
      );
    }

    if (size > MAX_FILE_SIZE_BYTES) {
      const sizeInMegabytes = Math.round(size / 1024 / 1024);
      throw new BadRequestException(
        `Filen är för stor (${sizeInMegabytes} MB). Maximal filstorlek är 50 MB.`
      );
    }

    try {
      const { content, title, metadata } = await this.pdfParser.parse(file.buffer, originalname);
      return { markdown: content, title, documentType: 'pdf', metadata };
    } catch (error) {
      // Every parse failure is reported to the client as a 400 with the reason.
      let message = 'Okänt fel vid parsning';
      if (error instanceof Error) {
        message = error.message;
      }
      console.error('[DocumentImport] Parse-fel för', originalname, ':', message);
      throw new BadRequestException(`Kunde inte läsa dokumentet: ${message}`);
    }
  }
}
|
||||
@@ -0,0 +1,44 @@
|
||||
/**
 * Structured output of a document parser.
 */
export interface ParsedDocument {
  /** Document title. */
  title: string;
  /** Document content in Markdown format. */
  content: string;
  /** Optional parser-specific metadata. */
  metadata?: Record<string, unknown>;
}

/**
 * Abstract base for document parsers.
 * Every document-type-specific parser should extend this class.
 */
export abstract class DocumentParser {
  /**
   * Parse a document buffer and return structured data.
   *
   * @param buffer Raw bytes of the document.
   * @param filename Name of the file the bytes came from.
   */
  abstract parse(buffer: Buffer, filename: string): Promise<ParsedDocument>;

  /**
   * Converts free text to Markdown.
   *
   * Consecutive non-blank lines are trimmed and joined with a single space
   * into one paragraph; blank lines separate paragraphs. The result starts
   * with a level-1 heading containing `title`, followed by the paragraphs
   * separated by blank lines.
   *
   * @param text Plain text with `\n` line breaks.
   * @param title Text for the leading `# ` heading.
   * @returns The Markdown document as a single string.
   */
  protected textToMarkdown(text: string, title: string): string {
    const paragraphs: string[] = [];
    let pending: string[] = [];

    // Closes the paragraph being collected, if it has any lines.
    const flush = (): void => {
      if (pending.length > 0) {
        paragraphs.push(pending.join(' '));
        pending = [];
      }
    };

    for (const rawLine of text.split('\n')) {
      const line = rawLine.trim();
      if (line === '') {
        flush();
      } else {
        pending.push(line);
      }
    }
    flush();

    return `# ${title}\n\n${paragraphs.join('\n\n')}`;
  }
}
|
||||
@@ -0,0 +1,48 @@
|
||||
import * as pdfParse from 'pdf-parse';
|
||||
import { DocumentParser, ParsedDocument } from './document.parser';
|
||||
|
||||
/**
 * Parser for PDF documents: extracts text with `pdf-parse` and renders it
 * as Markdown through the base class's `textToMarkdown`.
 */
export class PdfParser extends DocumentParser {
  /**
   * Parses a PDF buffer into a titled Markdown document.
   *
   * The title is derived from the filename: a trailing `.pdf` is removed and
   * runs of `_`/`-` become single spaces.
   *
   * @param buffer Raw bytes of the PDF.
   * @param filename Original filename, used for logging and the title.
   * @returns Title, Markdown content and metadata (page count, producer,
   *   creation date, character count).
   * @throws Error when the PDF cannot be read, is password protected, or
   *   yields 20 or fewer characters of trimmed text.
   */
  async parse(buffer: Buffer, filename: string): Promise<ParsedDocument> {
    console.log('[PdfParser] Parsing:', filename, '— Storlek:', buffer.length, 'bytes');

    const data = await this.readPdf(buffer);

    // Text extraction produced (almost) nothing, probably a scanned image PDF.
    if (!data.text || data.text.trim().length <= 20) {
      throw new Error(
        'PDFen verkar vara en skannad bild utan textlager. OCR-stöd kommer i nästa version.'
      );
    }

    const pageCount = data.numpages;
    const characterCount = data.text.length;
    console.log(
      `[PdfParser] Extraherade ${pageCount} sidor, ${characterCount} tecken från ${filename}`
    );

    const withoutExtension = filename.replace(/\.pdf$/i, '');
    const title = withoutExtension.replace(/[_-]+/g, ' ').trim();
    const content = this.textToMarkdown(data.text, title);

    const info = data.info as Record<string, unknown>;
    return {
      title,
      content,
      metadata: {
        pageCount,
        producer: info?.Producer ?? null,
        creationDate: info?.CreationDate ?? null,
        characterCount,
      },
    };
  }

  /**
   * Runs `pdf-parse` on the buffer and rewrites failures as readable errors.
   */
  private async readPdf(buffer: Buffer): Promise<Awaited<ReturnType<typeof pdfParse>>> {
    try {
      return await pdfParse(buffer);
    } catch (err) {
      // Password-protected or damaged PDFs end up here.
      const message = err instanceof Error ? err.message : String(err);
      const mentionsPassword = message.toLowerCase().includes('password');
      if (mentionsPassword) {
        throw new Error('PDF-filen är lösenordsskyddad och kan inte läsas');
      }
      throw new Error(`Kunde inte läsa PDF: ${message}`);
    }
  }
}
|
||||
Reference in New Issue
Block a user