feat: Integrate pdfjs-dist for improved PDF parsing fallback
Test Suite / test (24.x) (push) Has been cancelled
Test Suite / test (24.x) (push) Has been cancelled
Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
Generated
+278
@@ -16,6 +16,7 @@
|
|||||||
"multer": "^1.4.5-lts.1",
|
"multer": "^1.4.5-lts.1",
|
||||||
"pdf-lib": "^1.17.1",
|
"pdf-lib": "^1.17.1",
|
||||||
"pdf-parse": "^1.1.1",
|
"pdf-parse": "^1.1.1",
|
||||||
|
"pdfjs-dist": "^5.7.284",
|
||||||
"reflect-metadata": "^0.2.2",
|
"reflect-metadata": "^0.2.2",
|
||||||
"rxjs": "^7.8.1",
|
"rxjs": "^7.8.1",
|
||||||
"tesseract.js": "^5.1.1"
|
"tesseract.js": "^5.1.1"
|
||||||
@@ -411,6 +412,271 @@
|
|||||||
"node": ">=8"
|
"node": ">=8"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@napi-rs/canvas": {
|
||||||
|
"version": "0.1.100",
|
||||||
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.100.tgz",
|
||||||
|
"integrity": "sha512-xglYA6q3XO5P3BNJYxVZ1IV7DLVjp1Py6nwag88YntrS+3vKHyYcMqXVS4ZztJmwz2uGvz1FWhI/4LgbR5uQDA==",
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"workspaces": [
|
||||||
|
"e2e/*"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||||
|
},
|
||||||
|
"optionalDependencies": {
|
||||||
|
"@napi-rs/canvas-android-arm64": "0.1.100",
|
||||||
|
"@napi-rs/canvas-darwin-arm64": "0.1.100",
|
||||||
|
"@napi-rs/canvas-darwin-x64": "0.1.100",
|
||||||
|
"@napi-rs/canvas-linux-arm-gnueabihf": "0.1.100",
|
||||||
|
"@napi-rs/canvas-linux-arm64-gnu": "0.1.100",
|
||||||
|
"@napi-rs/canvas-linux-arm64-musl": "0.1.100",
|
||||||
|
"@napi-rs/canvas-linux-riscv64-gnu": "0.1.100",
|
||||||
|
"@napi-rs/canvas-linux-x64-gnu": "0.1.100",
|
||||||
|
"@napi-rs/canvas-linux-x64-musl": "0.1.100",
|
||||||
|
"@napi-rs/canvas-win32-arm64-msvc": "0.1.100",
|
||||||
|
"@napi-rs/canvas-win32-x64-msvc": "0.1.100"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@napi-rs/canvas-android-arm64": {
|
||||||
|
"version": "0.1.100",
|
||||||
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.100.tgz",
|
||||||
|
"integrity": "sha512-hjhCKhntPv9+t4ckHymdx0phYNcVW+GKQR6Lzw2zE+pOVjOplSmtx9nNNknTjbEDLcuLZqA1y8ufKg1XfgftzQ==",
|
||||||
|
"cpu": [
|
||||||
|
"arm64"
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"android"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@napi-rs/canvas-darwin-arm64": {
|
||||||
|
"version": "0.1.100",
|
||||||
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.100.tgz",
|
||||||
|
"integrity": "sha512-2PcswRaC7Ly645DGt88///zuFDhJxJYdKAs1uU3mfk1atYkXufgcgLfBpk6Tm12nCQBaNt1wpybuPZ4qOhTo8A==",
|
||||||
|
"cpu": [
|
||||||
|
"arm64"
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"darwin"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@napi-rs/canvas-darwin-x64": {
|
||||||
|
"version": "0.1.100",
|
||||||
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.100.tgz",
|
||||||
|
"integrity": "sha512-ePNZtj7pNIva/siZMg+HmbeozkIjqUIYdoymH8HaA3qK7LfzFN4WMBM8G6HQ9ZC+H3+Dnn5pqtiXpgLykaPOhw==",
|
||||||
|
"cpu": [
|
||||||
|
"x64"
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"darwin"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
|
||||||
|
"version": "0.1.100",
|
||||||
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.100.tgz",
|
||||||
|
"integrity": "sha512-d5cDB48oWFGU8/XPhUOFAlySgb/VAu7D+s8fi55K1Pcfg8aPplHWqMgibhVLU8ky7Pyg/fuiVLz4Nf3JrSTuUA==",
|
||||||
|
"cpu": [
|
||||||
|
"arm"
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@napi-rs/canvas-linux-arm64-gnu": {
|
||||||
|
"version": "0.1.100",
|
||||||
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.100.tgz",
|
||||||
|
"integrity": "sha512-rDxgxRu69RvDlX/bh9o22DxLsGr8EqsNgotL9+RwQE1S0b0cqeatqsw6aW45mukm0B42DIAaAacKaYQ8cqS1nw==",
|
||||||
|
"cpu": [
|
||||||
|
"arm64"
|
||||||
|
],
|
||||||
|
"libc": [
|
||||||
|
"glibc"
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@napi-rs/canvas-linux-arm64-musl": {
|
||||||
|
"version": "0.1.100",
|
||||||
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.100.tgz",
|
||||||
|
"integrity": "sha512-K3mDW66N+xT2/V439u1alFANiBUjdEx2gLiNYnCmUsva5jZMxWTjafBYwTzYK+EMFMHrUoabuU+T1BIP5CgbYQ==",
|
||||||
|
"cpu": [
|
||||||
|
"arm64"
|
||||||
|
],
|
||||||
|
"libc": [
|
||||||
|
"musl"
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
|
||||||
|
"version": "0.1.100",
|
||||||
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.100.tgz",
|
||||||
|
"integrity": "sha512-mooqUBTIsccZpnoQC4NgrC1v6C1vof39etLNMnBwCY+p0gajWJvAHLGQ6g/gGyS5YrpDW+GefSN4+Cvcr08UWw==",
|
||||||
|
"cpu": [
|
||||||
|
"riscv64"
|
||||||
|
],
|
||||||
|
"libc": [
|
||||||
|
"glibc"
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@napi-rs/canvas-linux-x64-gnu": {
|
||||||
|
"version": "0.1.100",
|
||||||
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.100.tgz",
|
||||||
|
"integrity": "sha512-1eCvkDCazm7FFhsT7DfGOdSaHgZVK3bt/dSBl5EWHOWmnz+I7j8tPseJqqD81NF+MH21jKUK4wQSDjN0mdhnTg==",
|
||||||
|
"cpu": [
|
||||||
|
"x64"
|
||||||
|
],
|
||||||
|
"libc": [
|
||||||
|
"glibc"
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@napi-rs/canvas-linux-x64-musl": {
|
||||||
|
"version": "0.1.100",
|
||||||
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.100.tgz",
|
||||||
|
"integrity": "sha512-20arT6lnI19S68qNlii73TSEDbECNgzMz2EpldC1V3mZFuRkeujXkcebRk0LRJe9SEUAooYiLokfMViY8IX7yA==",
|
||||||
|
"cpu": [
|
||||||
|
"x64"
|
||||||
|
],
|
||||||
|
"libc": [
|
||||||
|
"musl"
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@napi-rs/canvas-win32-arm64-msvc": {
|
||||||
|
"version": "0.1.100",
|
||||||
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-arm64-msvc/-/canvas-win32-arm64-msvc-0.1.100.tgz",
|
||||||
|
"integrity": "sha512-DZFFT1wIAg37LJw37yhMRFfjATd3vTQzjZ1Yki8u2vhO6Hi5VE6BVaGQ1aaDu7xb4iMErz+9EOwjpS7xcxFeBw==",
|
||||||
|
"cpu": [
|
||||||
|
"arm64"
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"win32"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@napi-rs/canvas-win32-x64-msvc": {
|
||||||
|
"version": "0.1.100",
|
||||||
|
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.100.tgz",
|
||||||
|
"integrity": "sha512-MyT1j3mHC2+Lu4pBi9mKyMJhtP6U7k7EldY7sj/uS5gJA65gTXt8MefJQXLJo5d/vZbuWmfxzkEUNc/urV3pHA==",
|
||||||
|
"cpu": [
|
||||||
|
"x64"
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"win32"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/@nestjs/cli": {
|
"node_modules/@nestjs/cli": {
|
||||||
"version": "10.4.9",
|
"version": "10.4.9",
|
||||||
"resolved": "https://registry.npmjs.org/@nestjs/cli/-/cli-10.4.9.tgz",
|
"resolved": "https://registry.npmjs.org/@nestjs/cli/-/cli-10.4.9.tgz",
|
||||||
@@ -3494,6 +3760,18 @@
|
|||||||
"url": "https://github.com/sponsors/mehmet-kozan"
|
"url": "https://github.com/sponsors/mehmet-kozan"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/pdfjs-dist": {
|
||||||
|
"version": "5.7.284",
|
||||||
|
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.7.284.tgz",
|
||||||
|
"integrity": "sha512-h4EdYQczmGhbOlqc3PPZwxevn7ApdWPbovAuWXOB/DjIyigSnwfy2oze7c6mRcSr9XgLp3eN3EeL4DyySTPMFw==",
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=22.13.0 || >=24"
|
||||||
|
},
|
||||||
|
"optionalDependencies": {
|
||||||
|
"@napi-rs/canvas": "^0.1.100"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/picocolors": {
|
"node_modules/picocolors": {
|
||||||
"version": "1.1.1",
|
"version": "1.1.1",
|
||||||
"resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
|
"resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
"multer": "^1.4.5-lts.1",
|
"multer": "^1.4.5-lts.1",
|
||||||
"pdf-lib": "^1.17.1",
|
"pdf-lib": "^1.17.1",
|
||||||
"pdf-parse": "^1.1.1",
|
"pdf-parse": "^1.1.1",
|
||||||
|
"pdfjs-dist": "^5.7.284",
|
||||||
"reflect-metadata": "^0.2.2",
|
"reflect-metadata": "^0.2.2",
|
||||||
"rxjs": "^7.8.1",
|
"rxjs": "^7.8.1",
|
||||||
"tesseract.js": "^5.1.1"
|
"tesseract.js": "^5.1.1"
|
||||||
|
|||||||
@@ -288,13 +288,23 @@ export class ReceiptParsingService {
|
|||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`pdf-parse misslyckades: ${err}`);
|
this.logger.warn(`pdf-parse misslyckades: ${err}`);
|
||||||
|
|
||||||
// Fallback to pdf-lib for more complex PDFs
|
// Fallback to pdfjs-dist for more complex PDFs
|
||||||
try {
|
try {
|
||||||
const { PDFDocument } = await import('pdf-lib');
|
const pdfjsLib = await import('pdfjs-dist');
|
||||||
const pdfDoc = await PDFDocument.load(buffer);
|
const loadingTask = pdfjsLib.getDocument({ data: buffer });
|
||||||
text = pdfDoc.getPages().map(page => page.getText()).join(' ');
|
const pdfDocument = await loadingTask.promise;
|
||||||
|
|
||||||
|
let fullText = '';
|
||||||
|
for (let i = 1; i <= pdfDocument.numPages; i++) {
|
||||||
|
const page = await pdfDocument.getPage(i);
|
||||||
|
const textContent = await page.getTextContent();
|
||||||
|
const pageText = textContent.items.map(item => item.str).join(' ');
|
||||||
|
fullText += pageText + ' ';
|
||||||
|
}
|
||||||
|
|
||||||
|
text = fullText;
|
||||||
} catch (fallbackErr) {
|
} catch (fallbackErr) {
|
||||||
this.logger.error(`Både pdf-parse och pdf-lib misslyckades: ${fallbackErr}`);
|
this.logger.error(`Både pdf-parse och pdfjs-dist misslyckades: ${fallbackErr}`);
|
||||||
throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad eller krypterad.');
|
throw new BadRequestException('PDF-filen kunde inte läsas. Kontrollera att filen inte är skadad eller krypterad.');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user