feat: Enhance PDF parsing with fallback to pdf-lib for improved handling of complex PDFs
Test Suite / test (24.x) (push) Has been cancelled

Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
Nils-Johan Gynther
2026-05-03 20:03:10 +02:00
parent 723730fd2f
commit 99343f74af
3 changed files with 55 additions and 1 deletions
+43
View File
@@ -14,6 +14,7 @@
"class-transformer": "^0.5.1", "class-transformer": "^0.5.1",
"class-validator": "^0.15.1", "class-validator": "^0.15.1",
"multer": "^1.4.5-lts.1", "multer": "^1.4.5-lts.1",
"pdf-lib": "^1.17.1",
"pdf-parse": "^1.1.1", "pdf-parse": "^1.1.1",
"reflect-metadata": "^0.2.2", "reflect-metadata": "^0.2.2",
"rxjs": "^7.8.1", "rxjs": "^7.8.1",
@@ -648,6 +649,24 @@
"npm": ">=5.0.0" "npm": ">=5.0.0"
} }
}, },
"node_modules/@pdf-lib/standard-fonts": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/@pdf-lib/standard-fonts/-/standard-fonts-1.0.0.tgz",
"integrity": "sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA==",
"license": "MIT",
"dependencies": {
"pako": "^1.0.6"
}
},
"node_modules/@pdf-lib/upng": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@pdf-lib/upng/-/upng-1.0.1.tgz",
"integrity": "sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==",
"license": "MIT",
"dependencies": {
"pako": "^1.0.10"
}
},
"node_modules/@pkgjs/parseargs": { "node_modules/@pkgjs/parseargs": {
"version": "0.11.0", "version": "0.11.0",
"resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz",
@@ -3351,6 +3370,12 @@
"dev": true, "dev": true,
"license": "BlueOak-1.0.0" "license": "BlueOak-1.0.0"
}, },
"node_modules/pako": {
"version": "1.0.11",
"resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz",
"integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==",
"license": "(MIT AND Zlib)"
},
"node_modules/parent-module": { "node_modules/parent-module": {
"version": "1.0.1", "version": "1.0.1",
"resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz",
@@ -3435,6 +3460,24 @@
"node": ">=8" "node": ">=8"
} }
}, },
"node_modules/pdf-lib": {
"version": "1.17.1",
"resolved": "https://registry.npmjs.org/pdf-lib/-/pdf-lib-1.17.1.tgz",
"integrity": "sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw==",
"license": "MIT",
"dependencies": {
"@pdf-lib/standard-fonts": "^1.0.0",
"@pdf-lib/upng": "^1.0.1",
"pako": "^1.0.11",
"tslib": "^1.11.1"
}
},
"node_modules/pdf-lib/node_modules/tslib": {
"version": "1.14.1",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz",
"integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==",
"license": "0BSD"
},
"node_modules/pdf-parse": { "node_modules/pdf-parse": {
"version": "1.1.4", "version": "1.1.4",
"resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-1.1.4.tgz", "resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-1.1.4.tgz",
+1
View File
@@ -14,6 +14,7 @@
"class-transformer": "^0.5.1", "class-transformer": "^0.5.1",
"class-validator": "^0.15.1", "class-validator": "^0.15.1",
"multer": "^1.4.5-lts.1", "multer": "^1.4.5-lts.1",
"pdf-lib": "^1.17.1",
"pdf-parse": "^1.1.1", "pdf-parse": "^1.1.1",
"reflect-metadata": "^0.2.2", "reflect-metadata": "^0.2.2",
"rxjs": "^7.8.1", "rxjs": "^7.8.1",
@@ -282,11 +282,21 @@ export class ReceiptParsingService {
private async parseReceiptFromPdf(buffer: Buffer, apiKey: string): Promise<ParsedReceiptItemRaw[]> { private async parseReceiptFromPdf(buffer: Buffer, apiKey: string): Promise<ParsedReceiptItemRaw[]> {
let text: string; let text: string;
try { try {
// Try pdf-parse first
const data = await pdfParse(buffer); const data = await pdfParse(buffer);
text = data.text; text = data.text;
} catch (err) { } catch (err) {
this.logger.warn(`pdf-parse misslyckades: ${err}`); this.logger.warn(`pdf-parse misslyckades: ${err}`);
throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad.');
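// Fall back to pdf-lib when pdf-parse cannot read the PDF.
// NOTE(review): pdf-lib's documented PDFPage API does not appear to expose getText();
// confirm this fallback can actually extract text, otherwise it always ends in the catch below.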
try {
const { PDFDocument } = await import('pdf-lib');
const pdfDoc = await PDFDocument.load(buffer);
text = pdfDoc.getPages().map(page => page.getText()).join(' ');
} catch (fallbackErr) {
this.logger.error(`Både pdf-parse och pdf-lib misslyckades: ${fallbackErr}`);
throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad eller krypterad.');
}
} }
const combinedLines = preprocessPdfLines(text); const combinedLines = preprocessPdfLines(text);