feat: Enhance PDF parsing with fallback to pdf-lib for improved handling of complex PDFs
Test Suite / test (24.x) (push) Has been cancelled
Test Suite / test (24.x) (push) Has been cancelled
Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
Generated
+43
@@ -14,6 +14,7 @@
|
|||||||
"class-transformer": "^0.5.1",
|
"class-transformer": "^0.5.1",
|
||||||
"class-validator": "^0.15.1",
|
"class-validator": "^0.15.1",
|
||||||
"multer": "^1.4.5-lts.1",
|
"multer": "^1.4.5-lts.1",
|
||||||
|
"pdf-lib": "^1.17.1",
|
||||||
"pdf-parse": "^1.1.1",
|
"pdf-parse": "^1.1.1",
|
||||||
"reflect-metadata": "^0.2.2",
|
"reflect-metadata": "^0.2.2",
|
||||||
"rxjs": "^7.8.1",
|
"rxjs": "^7.8.1",
|
||||||
@@ -648,6 +649,24 @@
|
|||||||
"npm": ">=5.0.0"
|
"npm": ">=5.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@pdf-lib/standard-fonts": {
|
||||||
|
"version": "1.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@pdf-lib/standard-fonts/-/standard-fonts-1.0.0.tgz",
|
||||||
|
"integrity": "sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"pako": "^1.0.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@pdf-lib/upng": {
|
||||||
|
"version": "1.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/@pdf-lib/upng/-/upng-1.0.1.tgz",
|
||||||
|
"integrity": "sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"pako": "^1.0.10"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/@pkgjs/parseargs": {
|
"node_modules/@pkgjs/parseargs": {
|
||||||
"version": "0.11.0",
|
"version": "0.11.0",
|
||||||
"resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz",
|
"resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz",
|
||||||
@@ -3351,6 +3370,12 @@
|
|||||||
"dev": true,
|
"dev": true,
|
||||||
"license": "BlueOak-1.0.0"
|
"license": "BlueOak-1.0.0"
|
||||||
},
|
},
|
||||||
|
"node_modules/pako": {
|
||||||
|
"version": "1.0.11",
|
||||||
|
"resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz",
|
||||||
|
"integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==",
|
||||||
|
"license": "(MIT AND Zlib)"
|
||||||
|
},
|
||||||
"node_modules/parent-module": {
|
"node_modules/parent-module": {
|
||||||
"version": "1.0.1",
|
"version": "1.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz",
|
||||||
@@ -3435,6 +3460,24 @@
|
|||||||
"node": ">=8"
|
"node": ">=8"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/pdf-lib": {
|
||||||
|
"version": "1.17.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/pdf-lib/-/pdf-lib-1.17.1.tgz",
|
||||||
|
"integrity": "sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"@pdf-lib/standard-fonts": "^1.0.0",
|
||||||
|
"@pdf-lib/upng": "^1.0.1",
|
||||||
|
"pako": "^1.0.11",
|
||||||
|
"tslib": "^1.11.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/pdf-lib/node_modules/tslib": {
|
||||||
|
"version": "1.14.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz",
|
||||||
|
"integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==",
|
||||||
|
"license": "0BSD"
|
||||||
|
},
|
||||||
"node_modules/pdf-parse": {
|
"node_modules/pdf-parse": {
|
||||||
"version": "1.1.4",
|
"version": "1.1.4",
|
||||||
"resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-1.1.4.tgz",
|
"resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-1.1.4.tgz",
|
||||||
|
|||||||
@@ -14,6 +14,7 @@
|
|||||||
"class-transformer": "^0.5.1",
|
"class-transformer": "^0.5.1",
|
||||||
"class-validator": "^0.15.1",
|
"class-validator": "^0.15.1",
|
||||||
"multer": "^1.4.5-lts.1",
|
"multer": "^1.4.5-lts.1",
|
||||||
|
"pdf-lib": "^1.17.1",
|
||||||
"pdf-parse": "^1.1.1",
|
"pdf-parse": "^1.1.1",
|
||||||
"reflect-metadata": "^0.2.2",
|
"reflect-metadata": "^0.2.2",
|
||||||
"rxjs": "^7.8.1",
|
"rxjs": "^7.8.1",
|
||||||
|
|||||||
@@ -282,11 +282,21 @@ export class ReceiptParsingService {
|
|||||||
private async parseReceiptFromPdf(buffer: Buffer, apiKey: string): Promise<ParsedReceiptItemRaw[]> {
|
private async parseReceiptFromPdf(buffer: Buffer, apiKey: string): Promise<ParsedReceiptItemRaw[]> {
|
||||||
let text: string;
|
let text: string;
|
||||||
try {
|
try {
|
||||||
|
// Try pdf-parse first
|
||||||
const data = await pdfParse(buffer);
|
const data = await pdfParse(buffer);
|
||||||
text = data.text;
|
text = data.text;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`pdf-parse misslyckades: ${err}`);
|
this.logger.warn(`pdf-parse misslyckades: ${err}`);
|
||||||
throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad.');
|
|
||||||
|
// Fallback to pdf-lib for more complex PDFs
|
||||||
|
try {
|
||||||
|
const { PDFDocument } = await import('pdf-lib');
|
||||||
|
const pdfDoc = await PDFDocument.load(buffer);
|
||||||
|
text = pdfDoc.getPages().map(page => page.getText()).join(' ');
|
||||||
|
} catch (fallbackErr) {
|
||||||
|
this.logger.error(`Både pdf-parse och pdf-lib misslyckades: ${fallbackErr}`);
|
||||||
|
throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad eller krypterad.');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const combinedLines = preprocessPdfLines(text);
|
const combinedLines = preprocessPdfLines(text);
|
||||||
|
|||||||
Reference in New Issue
Block a user