feat: Enhance PDF parsing with fallback to pdf-lib for improved handling of complex PDFs
Test Suite / test (24.x) (push) Has been cancelled
Test Suite / test (24.x) (push) Has been cancelled
Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
Generated
+43
@@ -14,6 +14,7 @@
|
||||
"class-transformer": "^0.5.1",
|
||||
"class-validator": "^0.15.1",
|
||||
"multer": "^1.4.5-lts.1",
|
||||
"pdf-lib": "^1.17.1",
|
||||
"pdf-parse": "^1.1.1",
|
||||
"reflect-metadata": "^0.2.2",
|
||||
"rxjs": "^7.8.1",
|
||||
@@ -648,6 +649,24 @@
|
||||
"npm": ">=5.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@pdf-lib/standard-fonts": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/@pdf-lib/standard-fonts/-/standard-fonts-1.0.0.tgz",
|
||||
"integrity": "sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"pako": "^1.0.6"
|
||||
}
|
||||
},
|
||||
"node_modules/@pdf-lib/upng": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@pdf-lib/upng/-/upng-1.0.1.tgz",
|
||||
"integrity": "sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"pako": "^1.0.10"
|
||||
}
|
||||
},
|
||||
"node_modules/@pkgjs/parseargs": {
|
||||
"version": "0.11.0",
|
||||
"resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz",
|
||||
@@ -3351,6 +3370,12 @@
|
||||
"dev": true,
|
||||
"license": "BlueOak-1.0.0"
|
||||
},
|
||||
"node_modules/pako": {
|
||||
"version": "1.0.11",
|
||||
"resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz",
|
||||
"integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==",
|
||||
"license": "(MIT AND Zlib)"
|
||||
},
|
||||
"node_modules/parent-module": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz",
|
||||
@@ -3435,6 +3460,24 @@
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/pdf-lib": {
|
||||
"version": "1.17.1",
|
||||
"resolved": "https://registry.npmjs.org/pdf-lib/-/pdf-lib-1.17.1.tgz",
|
||||
"integrity": "sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@pdf-lib/standard-fonts": "^1.0.0",
|
||||
"@pdf-lib/upng": "^1.0.1",
|
||||
"pako": "^1.0.11",
|
||||
"tslib": "^1.11.1"
|
||||
}
|
||||
},
|
||||
"node_modules/pdf-lib/node_modules/tslib": {
|
||||
"version": "1.14.1",
|
||||
"resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz",
|
||||
"integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==",
|
||||
"license": "0BSD"
|
||||
},
|
||||
"node_modules/pdf-parse": {
|
||||
"version": "1.1.4",
|
||||
"resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-1.1.4.tgz",
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
"class-transformer": "^0.5.1",
|
||||
"class-validator": "^0.15.1",
|
||||
"multer": "^1.4.5-lts.1",
|
||||
"pdf-lib": "^1.17.1",
|
||||
"pdf-parse": "^1.1.1",
|
||||
"reflect-metadata": "^0.2.2",
|
||||
"rxjs": "^7.8.1",
|
||||
|
||||
@@ -282,11 +282,21 @@ export class ReceiptParsingService {
|
||||
private async parseReceiptFromPdf(buffer: Buffer, apiKey: string): Promise<ParsedReceiptItemRaw[]> {
|
||||
let text: string;
|
||||
try {
|
||||
// Try pdf-parse first
|
||||
const data = await pdfParse(buffer);
|
||||
text = data.text;
|
||||
} catch (err) {
|
||||
this.logger.warn(`pdf-parse misslyckades: ${err}`);
|
||||
throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad.');
|
||||
|
||||
// Fallback to pdf-lib for more complex PDFs
|
||||
try {
|
||||
const { PDFDocument } = await import('pdf-lib');
|
||||
const pdfDoc = await PDFDocument.load(buffer);
|
||||
text = pdfDoc.getPages().map(page => page.getText()).join(' ');
|
||||
} catch (fallbackErr) {
|
||||
this.logger.error(`Både pdf-parse och pdf-lib misslyckades: ${fallbackErr}`);
|
||||
throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad eller krypterad.');
|
||||
}
|
||||
}
|
||||
|
||||
const combinedLines = preprocessPdfLines(text);
|
||||
|
||||
Reference in New Issue
Block a user