From 99343f74af5068f80ad6e0b353c9911c81f112e4 Mon Sep 17 00:00:00 2001 From: Nils-Johan Gynther Date: Sun, 3 May 2026 20:03:10 +0200 Subject: [PATCH] feat: Enhance PDF parsing with fallback to pdf-lib for improved handling of complex PDFs Co-authored-by: Copilot --- backend/package-lock.json | 43 +++++++++++++++++++ backend/package.json | 1 + .../receipt-parsing.service.ts | 12 +++++- 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/backend/package-lock.json b/backend/package-lock.json index 25fbcf3..426ef1f 100644 --- a/backend/package-lock.json +++ b/backend/package-lock.json @@ -14,6 +14,7 @@ "class-transformer": "^0.5.1", "class-validator": "^0.15.1", "multer": "^1.4.5-lts.1", + "pdf-lib": "^1.17.1", "pdf-parse": "^1.1.1", "reflect-metadata": "^0.2.2", "rxjs": "^7.8.1", @@ -648,6 +649,24 @@ "npm": ">=5.0.0" } }, + "node_modules/@pdf-lib/standard-fonts": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@pdf-lib/standard-fonts/-/standard-fonts-1.0.0.tgz", + "integrity": "sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA==", + "license": "MIT", + "dependencies": { + "pako": "^1.0.6" + } + }, + "node_modules/@pdf-lib/upng": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@pdf-lib/upng/-/upng-1.0.1.tgz", + "integrity": "sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==", + "license": "MIT", + "dependencies": { + "pako": "^1.0.10" + } + }, "node_modules/@pkgjs/parseargs": { "version": "0.11.0", "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", @@ -3351,6 +3370,12 @@ "dev": true, "license": "BlueOak-1.0.0" }, + "node_modules/pako": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", + "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==", + "license": "(MIT AND Zlib)" + }, "node_modules/parent-module": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", @@ -3435,6 +3460,24 @@ "node": ">=8" } }, + "node_modules/pdf-lib": { + "version": "1.17.1", + "resolved": "https://registry.npmjs.org/pdf-lib/-/pdf-lib-1.17.1.tgz", + "integrity": "sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw==", + "license": "MIT", + "dependencies": { + "@pdf-lib/standard-fonts": "^1.0.0", + "@pdf-lib/upng": "^1.0.1", + "pako": "^1.0.11", + "tslib": "^1.11.1" + } + }, + "node_modules/pdf-lib/node_modules/tslib": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz", + "integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==", + "license": "0BSD" + }, "node_modules/pdf-parse": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-1.1.4.tgz", diff --git a/backend/package.json b/backend/package.json index c600b97..2076b4a 100644 --- a/backend/package.json +++ b/backend/package.json @@ -14,6 +14,7 @@ "class-transformer": "^0.5.1", "class-validator": "^0.15.1", "multer": "^1.4.5-lts.1", + "pdf-lib": "^1.17.1", "pdf-parse": "^1.1.1", "reflect-metadata": "^0.2.2", "rxjs": "^7.8.1", diff --git a/backend/src/receipt-parsing/receipt-parsing.service.ts b/backend/src/receipt-parsing/receipt-parsing.service.ts index fbd4801..3022893 100644 --- a/backend/src/receipt-parsing/receipt-parsing.service.ts +++ b/backend/src/receipt-parsing/receipt-parsing.service.ts @@ -282,11 +282,21 @@ export class ReceiptParsingService { private async parseReceiptFromPdf(buffer: Buffer, apiKey: string): Promise { let text: string; try { + // Try pdf-parse first const data = await pdfParse(buffer); text = data.text; } catch (err) { this.logger.warn(`pdf-parse misslyckades: ${err}`); - throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad.'); + + // Fallback to pdf-lib for more complex PDFs + try { + const { PDFDocument } = await import('pdf-lib'); + const pdfDoc = await PDFDocument.load(buffer); + text = pdfDoc.getPages().map(page => page.getText()).join(' '); + } catch (fallbackErr) { + this.logger.error(`Både pdf-parse och pdf-lib misslyckades: ${fallbackErr}`); + throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad eller krypterad.'); + } } const combinedLines = preprocessPdfLines(text);