feat(flyer-import): integrate AI-based flyer parsing with image support
- Add support for PNG, JPEG, and WebP image formats in flyer import
- Replace external importer service with internal AI-based parsing pipeline
- Add new services: TextExtractorService, AiFlyerParserService, FlyerNormalizerService
- Integrate Mistral AI, pdf-parse, and tesseract.js dependencies
- Add quality confidence indicators and warning panels in Flutter UI
- Update package.json with new dependencies and transform ignore patterns
- Add documentation for flyer importer system
- Add Kilo AI planning file for Happy Island project

BREAKING CHANGE: Flyer import now uses internal AI parsing instead of external importer service
This commit is contained in:
Generated
+166
@@ -8,6 +8,7 @@
|
||||
"name": "recipe-api",
|
||||
"version": "0.0.1",
|
||||
"dependencies": {
|
||||
"@mistralai/mistralai": "^0.5.0",
|
||||
"@nestjs/common": "^11.1.19",
|
||||
"@nestjs/core": "^11.1.19",
|
||||
"@nestjs/jwt": "^11.0.2",
|
||||
@@ -22,10 +23,12 @@
|
||||
"multer": "^2.1.1",
|
||||
"passport": "^0.7.0",
|
||||
"passport-jwt": "^4.0.1",
|
||||
"pdf-parse": "^1.1.1",
|
||||
"prisma": "6.12.0",
|
||||
"reflect-metadata": "^0.2.2",
|
||||
"rxjs": "^7.8.1",
|
||||
"sharp": "^0.33.5",
|
||||
"tesseract.js": "^5.1.1",
|
||||
"uuid": "^11.1.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
@@ -38,6 +41,7 @@
|
||||
"@types/multer": "^1.4.12",
|
||||
"@types/node": "^22.15.29",
|
||||
"@types/passport-jwt": "^4.0.1",
|
||||
"@types/pdf-parse": "^1.1.5",
|
||||
"@types/supertest": "^7.2.0",
|
||||
"@types/uuid": "^10.0.0",
|
||||
"@typescript-eslint/eslint-plugin": "^8.46.2",
|
||||
@@ -2145,6 +2149,15 @@
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/@mistralai/mistralai": {
|
||||
"version": "0.5.0",
|
||||
"resolved": "https://registry.npmjs.org/@mistralai/mistralai/-/mistralai-0.5.0.tgz",
|
||||
"integrity": "sha512-56xfoC/0CiT0RFHrRNoJYSKCNc922EyHzEPJYY6ttalQ5KZdrNVgXeOetIGX0lDx7IjbxAJrrae2MQgUIlL9+g==",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"node-fetch": "^2.6.7"
|
||||
}
|
||||
},
|
||||
"node_modules/@nestjs/cli": {
|
||||
"version": "11.0.21",
|
||||
"resolved": "https://registry.npmjs.org/@nestjs/cli/-/cli-11.0.21.tgz",
|
||||
@@ -2810,6 +2823,16 @@
|
||||
"@types/passport": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/pdf-parse": {
|
||||
"version": "1.1.5",
|
||||
"resolved": "https://registry.npmjs.org/@types/pdf-parse/-/pdf-parse-1.1.5.tgz",
|
||||
"integrity": "sha512-kBfrSXsloMnUJOKi25s3+hRmkycHfLK6A09eRGqF/N8BkQoPUmaCr+q8Cli5FnfohEz/rsv82zAiPz/LXtOGhA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/qs": {
|
||||
"version": "6.15.1",
|
||||
"resolved": "https://registry.npmjs.org/@types/qs/-/qs-6.15.1.tgz",
|
||||
@@ -3795,6 +3818,12 @@
|
||||
"readable-stream": "^3.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/bmp-js": {
|
||||
"version": "0.1.0",
|
||||
"resolved": "https://registry.npmjs.org/bmp-js/-/bmp-js-0.1.0.tgz",
|
||||
"integrity": "sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/body-parser": {
|
||||
"version": "2.2.2",
|
||||
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.2.tgz",
|
||||
@@ -5890,6 +5919,12 @@
|
||||
"url": "https://opencollective.com/express"
|
||||
}
|
||||
},
|
||||
"node_modules/idb-keyval": {
|
||||
"version": "6.2.2",
|
||||
"resolved": "https://registry.npmjs.org/idb-keyval/-/idb-keyval-6.2.2.tgz",
|
||||
"integrity": "sha512-yjD9nARJ/jb1g+CvD0tlhUHOrJ9Sy0P8T9MF3YaLlHnSRpwPfpTX0XIvpmw3gAJUmEu3FiICLBDPXVwyEvrleg==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/ieee754": {
|
||||
"version": "1.2.1",
|
||||
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
|
||||
@@ -6017,6 +6052,12 @@
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/is-electron": {
|
||||
"version": "2.2.2",
|
||||
"resolved": "https://registry.npmjs.org/is-electron/-/is-electron-2.2.2.tgz",
|
||||
"integrity": "sha512-FO/Rhvz5tuw4MCWkpMzHFKWD2LsfHzIb7i6MdPYZ/KW7AlxawyLkqdy+jPZP1WubqEADE3O4FUENlJHDfQASRg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/is-extglob": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz",
|
||||
@@ -6112,6 +6153,12 @@
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/is-url": {
|
||||
"version": "1.2.4",
|
||||
"resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz",
|
||||
"integrity": "sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/isexe": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz",
|
||||
@@ -7573,6 +7620,32 @@
|
||||
"lodash": "^4.17.21"
|
||||
}
|
||||
},
|
||||
"node_modules/node-ensure": {
|
||||
"version": "0.0.0",
|
||||
"resolved": "https://registry.npmjs.org/node-ensure/-/node-ensure-0.0.0.tgz",
|
||||
"integrity": "sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/node-fetch": {
|
||||
"version": "2.7.0",
|
||||
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
|
||||
"integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"whatwg-url": "^5.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": "4.x || >=6.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"encoding": "^0.1.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"encoding": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/node-int64": {
|
||||
"version": "0.4.0",
|
||||
"resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz",
|
||||
@@ -7668,6 +7741,15 @@
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/opencollective-postinstall": {
|
||||
"version": "2.0.3",
|
||||
"resolved": "https://registry.npmjs.org/opencollective-postinstall/-/opencollective-postinstall-2.0.3.tgz",
|
||||
"integrity": "sha512-8AV/sCtuzUeTo8gQK5qDZzARrulB3egtLzFgteqB2tcT4Mw7B8Kt7JcDHmltjz6FOAHsvTevk70gZEbhM4ZS9Q==",
|
||||
"license": "MIT",
|
||||
"bin": {
|
||||
"opencollective-postinstall": "index.js"
|
||||
}
|
||||
},
|
||||
"node_modules/optionator": {
|
||||
"version": "0.9.4",
|
||||
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz",
|
||||
@@ -7921,6 +8003,22 @@
|
||||
"resolved": "https://registry.npmjs.org/pause/-/pause-0.0.1.tgz",
|
||||
"integrity": "sha512-KG8UEiEVkR3wGEb4m5yZkVCzigAD+cVEJck2CzYZO37ZGJfctvVptVO192MwrtPhzONn6go8ylnOdMhKqi4nfg=="
|
||||
},
|
||||
"node_modules/pdf-parse": {
|
||||
"version": "1.1.4",
|
||||
"resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-1.1.4.tgz",
|
||||
"integrity": "sha512-XRIRcLgk6ZnUbsHsYXExMw+krrPE81hJ6FQPLdBNhhBefqIQKXu/WeTgNBGSwPrfU0v+UCEwn7AoAUOsVKHFvQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"node-ensure": "^0.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.8.1"
|
||||
},
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/mehmet-kozan"
|
||||
}
|
||||
},
|
||||
"node_modules/picocolors": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
|
||||
@@ -8171,6 +8269,12 @@
|
||||
"integrity": "sha512-urBwgfrvVP/eAyXx4hluJivBKzuEbSQs9rKWCrCkbSxNv8mxPcUZKeuoF3Uy4mJl3Lwprp6yy5/39VWigZ4K6Q==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/regenerator-runtime": {
|
||||
"version": "0.13.11",
|
||||
"resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz",
|
||||
"integrity": "sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/require-directory": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
|
||||
@@ -9050,6 +9154,31 @@
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/tesseract.js": {
|
||||
"version": "5.1.1",
|
||||
"resolved": "https://registry.npmjs.org/tesseract.js/-/tesseract.js-5.1.1.tgz",
|
||||
"integrity": "sha512-lzVl/Ar3P3zhpUT31NjqeCo1f+D5+YfpZ5J62eo2S14QNVOmHBTtbchHm/YAbOOOzCegFnKf4B3Qih9LuldcYQ==",
|
||||
"hasInstallScript": true,
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"bmp-js": "^0.1.0",
|
||||
"idb-keyval": "^6.2.0",
|
||||
"is-electron": "^2.2.2",
|
||||
"is-url": "^1.2.4",
|
||||
"node-fetch": "^2.6.9",
|
||||
"opencollective-postinstall": "^2.0.3",
|
||||
"regenerator-runtime": "^0.13.3",
|
||||
"tesseract.js-core": "^5.1.1",
|
||||
"wasm-feature-detect": "^1.2.11",
|
||||
"zlibjs": "^0.3.1"
|
||||
}
|
||||
},
|
||||
"node_modules/tesseract.js-core": {
|
||||
"version": "5.1.1",
|
||||
"resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-5.1.1.tgz",
|
||||
"integrity": "sha512-KX3bYSU5iGcO1XJa+QGPbi+Zjo2qq6eBhNjSGR5E5q0JtzkoipJKOUQD7ph8kFyteCEfEQ0maWLu8MCXtvX5uQ==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/test-exclude": {
|
||||
"version": "6.0.0",
|
||||
"resolved": "https://registry.npmjs.org/test-exclude/-/test-exclude-6.0.0.tgz",
|
||||
@@ -9151,6 +9280,12 @@
|
||||
"url": "https://github.com/sponsors/Borewit"
|
||||
}
|
||||
},
|
||||
"node_modules/tr46": {
|
||||
"version": "0.0.3",
|
||||
"resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
|
||||
"integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/ts-api-utils": {
|
||||
"version": "2.5.0",
|
||||
"resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.5.0.tgz",
|
||||
@@ -9537,6 +9672,12 @@
|
||||
"makeerror": "1.0.12"
|
||||
}
|
||||
},
|
||||
"node_modules/wasm-feature-detect": {
|
||||
"version": "1.8.0",
|
||||
"resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz",
|
||||
"integrity": "sha512-zksaLKM2fVlnB5jQQDqKXXwYHLQUVH9es+5TOOHwGOVJOCeRBCiPjwSg+3tN2AdTCzjgli4jijCH290kXb/zWQ==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/watchpack": {
|
||||
"version": "2.5.1",
|
||||
"resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.5.1.tgz",
|
||||
@@ -9561,6 +9702,12 @@
|
||||
"defaults": "^1.0.3"
|
||||
}
|
||||
},
|
||||
"node_modules/webidl-conversions": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
|
||||
"integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==",
|
||||
"license": "BSD-2-Clause"
|
||||
},
|
||||
"node_modules/webpack": {
|
||||
"version": "5.106.0",
|
||||
"resolved": "https://registry.npmjs.org/webpack/-/webpack-5.106.0.tgz",
|
||||
@@ -9668,6 +9815,16 @@
|
||||
"url": "https://opencollective.com/webpack"
|
||||
}
|
||||
},
|
||||
"node_modules/whatwg-url": {
|
||||
"version": "5.0.0",
|
||||
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
|
||||
"integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"tr46": "~0.0.3",
|
||||
"webidl-conversions": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/which": {
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
|
||||
@@ -9814,6 +9971,15 @@
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/zlibjs": {
|
||||
"version": "0.3.1",
|
||||
"resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz",
|
||||
"integrity": "sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
"test:watch": "jest --watch"
|
||||
},
|
||||
"dependencies": {
|
||||
"@mistralai/mistralai": "^0.5.0",
|
||||
"@nestjs/common": "^11.1.19",
|
||||
"@nestjs/core": "^11.1.19",
|
||||
"@nestjs/jwt": "^11.0.2",
|
||||
@@ -32,10 +33,12 @@
|
||||
"multer": "^2.1.1",
|
||||
"passport": "^0.7.0",
|
||||
"passport-jwt": "^4.0.1",
|
||||
"pdf-parse": "^1.1.1",
|
||||
"prisma": "6.12.0",
|
||||
"reflect-metadata": "^0.2.2",
|
||||
"rxjs": "^7.8.1",
|
||||
"sharp": "^0.33.5",
|
||||
"tesseract.js": "^5.1.1",
|
||||
"uuid": "^11.1.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
@@ -48,6 +51,7 @@
|
||||
"@types/multer": "^1.4.12",
|
||||
"@types/node": "^22.15.29",
|
||||
"@types/passport-jwt": "^4.0.1",
|
||||
"@types/pdf-parse": "^1.1.5",
|
||||
"@types/supertest": "^7.2.0",
|
||||
"@types/uuid": "^10.0.0",
|
||||
"@typescript-eslint/eslint-plugin": "^8.46.2",
|
||||
@@ -67,6 +71,9 @@
|
||||
"js",
|
||||
"json",
|
||||
"ts"
|
||||
],
|
||||
"transformIgnorePatterns": [
  "node_modules/(?!(@mistralai)/)"
]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ import { ThrottlerGuard, ThrottlerModule } from '@nestjs/throttler';
|
||||
import { JwtAuthGuard } from './auth/jwt-auth.guard';
|
||||
import { RolesGuard } from './auth/roles.guard';
|
||||
|
||||
describe('App security configuration', () => {
|
||||
describe('App security configuration', () => {
|
||||
function getAppModuleClass() {
|
||||
process.env.JWT_SECRET = process.env.JWT_SECRET ?? 'test-secret';
|
||||
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
||||
|
||||
@@ -18,6 +18,9 @@ const ALLOWED_MIMES = [
|
||||
'application/pdf',
|
||||
'application/octet-stream',
|
||||
'text/plain',
|
||||
'image/png',
|
||||
'image/jpeg',
|
||||
'image/webp',
|
||||
];
|
||||
|
||||
@Controller('flyer-import')
|
||||
@@ -41,7 +44,7 @@ export class FlyerImportController {
|
||||
throw new BadRequestException('Ingen fil skickades med.');
|
||||
}
|
||||
if (!ALLOWED_MIMES.includes(file.mimetype)) {
|
||||
throw new BadRequestException('Otillåten filtyp. Använd PDF eller textfil.');
|
||||
throw new BadRequestException('Otillåten filtyp. Använd PDF, textfil eller bild (PNG, JPEG, WebP).');
|
||||
}
|
||||
|
||||
const userId =
|
||||
|
||||
@@ -2,10 +2,18 @@ import { Module } from '@nestjs/common';
|
||||
import { PrismaModule } from '../prisma/prisma.module';
|
||||
import { FlyerImportController } from './flyer-import.controller';
|
||||
import { FlyerImportService } from './flyer-import.service';
|
||||
import { TextExtractorService } from './services/text-extractor.service';
|
||||
import { AiFlyerParserService } from './services/ai-flyer-parser.service';
|
||||
import { FlyerNormalizerService } from './services/flyer-normalizer.service';
|
||||
|
||||
@Module({
|
||||
imports: [PrismaModule],
|
||||
controllers: [FlyerImportController],
|
||||
providers: [FlyerImportService],
|
||||
providers: [
|
||||
FlyerImportService,
|
||||
TextExtractorService,
|
||||
AiFlyerParserService,
|
||||
FlyerNormalizerService,
|
||||
],
|
||||
})
|
||||
export class FlyerImportModule {}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import {
|
||||
BadRequestException,
|
||||
Injectable,
|
||||
Logger,
|
||||
ServiceUnavailableException,
|
||||
import {
|
||||
BadRequestException,
|
||||
Injectable,
|
||||
Logger,
|
||||
ServiceUnavailableException,
|
||||
} from '@nestjs/common';
|
||||
import { Prisma } from '@prisma/client';
|
||||
import { PrismaService } from '../prisma/prisma.service';
|
||||
@@ -12,8 +12,9 @@ import {
|
||||
FlyerImportMatchVia,
|
||||
FlyerImportResponse,
|
||||
} from './dto/flyer-import.response';
|
||||
|
||||
const IMPORTER_SERVICE_URL = process.env.IMPORTER_SERVICE_URL || 'http://importer-api:3001';
|
||||
import { TextExtractorService } from './services/text-extractor.service';
|
||||
import { AiFlyerParserService } from './services/ai-flyer-parser.service';
|
||||
import { FlyerNormalizerService } from './services/flyer-normalizer.service';
|
||||
|
||||
type FlyerParseItem = {
|
||||
rawName: string;
|
||||
@@ -53,10 +54,15 @@ type ProductLite = {
|
||||
export class FlyerImportService {
|
||||
private readonly logger = new Logger(FlyerImportService.name);
|
||||
|
||||
constructor(private readonly prisma: PrismaService) {}
|
||||
constructor(
|
||||
private readonly prisma: PrismaService,
|
||||
private readonly textExtractor: TextExtractorService,
|
||||
private readonly aiParser: AiFlyerParserService,
|
||||
private readonly normalizer: FlyerNormalizerService,
|
||||
) {}
|
||||
|
||||
async parseAndMatch(file: Express.Multer.File, userId: number): Promise<FlyerImportResponse> {
|
||||
const parsed = await this.parseViaImporter(file);
|
||||
const parsed = await this.parseViaInternal(file);
|
||||
|
||||
const [products, aliases] = await Promise.all([
|
||||
this.prisma.product.findMany({
|
||||
@@ -371,43 +377,59 @@ export class FlyerImportService {
|
||||
return allowed.has(cleaned) ? cleaned : cleaned;
|
||||
}
|
||||
|
||||
private async parseViaImporter(file: Express.Multer.File): Promise<FlyerParseResponse> {
|
||||
const form = new FormData();
|
||||
form.append(
|
||||
'file',
|
||||
new Blob([new Uint8Array(file.buffer)], { type: file.mimetype }),
|
||||
file.originalname,
|
||||
);
|
||||
form.append('retailer', 'willys');
|
||||
|
||||
let response: Response;
|
||||
private async parseViaInternal(file: Express.Multer.File): Promise<FlyerParseResponse> {
|
||||
try {
|
||||
response = await fetch(`${IMPORTER_SERVICE_URL}/api/flyer/parse`, {
|
||||
method: 'POST',
|
||||
body: form,
|
||||
});
|
||||
} catch (err) {
|
||||
this.logger.error(`Kunde inte nå importer-api för flyer-parse: ${String(err)}`);
|
||||
throw new ServiceUnavailableException('Importer-tjänsten är inte tillgänglig just nu.');
|
||||
}
|
||||
this.logger.debug(`Parsing flyer file: ${file.originalname}`);
|
||||
|
||||
if (!response.ok) {
|
||||
let message = `Importer-tjänsten svarade ${response.status}`;
|
||||
try {
|
||||
const body = (await response.json()) as { message?: string };
|
||||
if (typeof body.message === 'string' && body.message.trim()) {
|
||||
message = body.message;
|
||||
}
|
||||
} catch {
|
||||
// ignore parse issues
|
||||
// 1. Extrahera text från PDF/bild
|
||||
const text = await this.textExtractor.extractText(
|
||||
file.buffer,
|
||||
file.mimetype,
|
||||
file.originalname,
|
||||
);
|
||||
|
||||
// 2. Skicka till Mistral Tiny
|
||||
const aiItems = await this.aiParser.parseWithAI(text);
|
||||
|
||||
// 3. Normalisera resultatet
|
||||
const normalizedItems = this.normalizer.normalize(aiItems);
|
||||
|
||||
// 4. Konvertera till intern FlyerParseItem-format
|
||||
const items: FlyerParseItem[] = normalizedItems.map((item) => ({
|
||||
rawName: item.rawName,
|
||||
normalizedName: item.normalizedName,
|
||||
category: item.categoryHint,
|
||||
price: item.price,
|
||||
priceUnit: item.priceUnit,
|
||||
comparisonPrice: item.comparisonPrice,
|
||||
comparisonUnit: item.comparisonUnit,
|
||||
offerText: item.offerText,
|
||||
confidence: item.parseConfidence,
|
||||
reasonCodes: item.parseReasons,
|
||||
}));
|
||||
|
||||
const warnings: string[] = [];
|
||||
if (items.length === 0) {
|
||||
warnings.push('Inga produkter kunde extraheras från flyern.');
|
||||
}
|
||||
|
||||
if (response.status >= 400 && response.status < 500) {
|
||||
throw new BadRequestException(message);
|
||||
}
|
||||
throw new ServiceUnavailableException(message);
|
||||
return {
|
||||
retailer: 'willys',
|
||||
parserVersion: 'v1',
|
||||
items,
|
||||
warnings,
|
||||
};
|
||||
} catch (err) {
|
||||
if (err instanceof BadRequestException) {
|
||||
throw err;
|
||||
}
|
||||
if (err instanceof ServiceUnavailableException) {
|
||||
throw err;
|
||||
}
|
||||
this.logger.error(`Internal flyer parse failed: ${String(err)}`);
|
||||
throw new BadRequestException(
|
||||
`Fel vid tolkning av flyer: ${err instanceof Error ? err.message : String(err)}`,
|
||||
);
|
||||
}
|
||||
|
||||
return response.json() as Promise<FlyerParseResponse>;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,234 @@
|
||||
import {
|
||||
BadRequestException,
|
||||
Injectable,
|
||||
Logger,
|
||||
ServiceUnavailableException,
|
||||
} from '@nestjs/common';
|
||||
|
||||
/**
 * One product entry produced by the AI flyer parser, after type coercion
 * of the raw JSON object returned by the model.
 */
export interface AiFlyerParseResult {
  /** Product name as returned by the model (placeholder name when the model gave none). */
  rawName: string;
  /** Lower-cased, punctuation-stripped, whitespace-collapsed form of `rawName`. */
  normalizedName: string;
  /** Category label from the model, or `null` when absent. */
  category: string | null;
  /** Price as a number, or `null` when absent or unparseable. */
  price: number | null;
  /** Unit the model reported in its `unit` field, or `null`. */
  priceUnit: string | null;
  /** Comparison price as a number, or `null` when absent or unparseable. */
  comparisonPrice: number | null;
  /** Unit of the comparison price, or `null`. */
  comparisonUnit: string | null;
  /** Offer text (a string, or the model's offer array joined with spaces), or `null`. */
  offerText: string | null;
  /** Confidence score attached to the parse. */
  confidence: number;
  /** Machine-readable tags describing how the item was produced. */
  reasonCodes: string[];
}
|
||||
|
||||
/** Minimal shape of the Mistral chat response that this service actually reads. */
type MistralChatResponse = {
  choices?: Array<{ message?: { content?: unknown } }>;
};

/**
 * Sends extracted flyer text to the Mistral chat API and converts the model's
 * JSON answer into a list of {@link AiFlyerParseResult} items.
 *
 * The constructor throws when `MISTRAL_API_KEY` is not set, so the service
 * cannot be instantiated without the key.
 */
@Injectable()
export class AiFlyerParserService {
  private readonly logger = new Logger(AiFlyerParserService.name);
  /** Upper bound for a single Mistral request, in milliseconds. */
  private readonly timeoutMs = 15_000;
  /** Maximum number of flyer-text characters embedded in the prompt. */
  private readonly maxPromptChars = 5000;
  // The SDK is loaded through a dynamic import and used untyped, hence `any`.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private mistral: any;
  private apiKey: string;

  constructor() {
    this.apiKey = process.env.MISTRAL_API_KEY ?? '';
    if (!this.apiKey) {
      throw new Error('MISTRAL_API_KEY environment variable not set');
    }
  }

  /**
   * Lazily creates the Mistral client and caches it for later calls.
   *
   * @returns The cached SDK client instance.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private async getClient(): Promise<any> {
    if (this.mistral) return this.mistral;
    const mistralModule = await import('@mistralai/mistralai');
    this.mistral = new mistralModule.default(this.apiKey);
    return this.mistral;
  }

  /**
   * Sends flyer text to Mistral Tiny for structured extraction.
   *
   * Array elements in the model's answer that are not plain objects are
   * skipped (with a warning) instead of failing the whole request.
   *
   * @param text Text from the flyer (from pdf-parse or OCR).
   * @returns The parsed products.
   * @throws BadRequestException When the text is empty, the model returns no
   *   content, or the content is not a valid JSON array.
   * @throws ServiceUnavailableException When the request times out or fails
   *   for any other reason.
   */
  async parseWithAI(text: string): Promise<AiFlyerParseResult[]> {
    if (!text || text.trim().length === 0) {
      throw new BadRequestException('Flyer-texten är tom. Kan inte fortsätta.');
    }

    const prompt = this.buildPrompt(text);

    try {
      this.logger.debug('Sending request to Mistral Tiny');

      const client = await this.getClient();
      const response = await this.withTimeout<MistralChatResponse>(
        client.chat({
          model: 'mistral-tiny',
          messages: [{ role: 'user', content: prompt }],
          temperature: 0.1,
        }),
        this.timeoutMs,
        'Mistral-anrop timeout',
      );

      const content = response.choices?.[0]?.message?.content;
      if (typeof content !== 'string' || content.length === 0) {
        throw new BadRequestException('Tomt svar från AI-modellen.');
      }

      this.logger.debug(`Mistral response length: ${content.length} chars`);

      // Clean up the answer and parse it as JSON.
      const parsed: unknown = JSON.parse(this.sanitizeJsonResponse(content));

      if (!Array.isArray(parsed)) {
        throw new BadRequestException('AI returnerade inte en JSON-array.');
      }

      const results: AiFlyerParseResult[] = [];
      parsed.forEach((entry: unknown, idx: number) => {
        // A null/primitive/array element would otherwise throw a TypeError
        // below and be reported as a misleading "service unavailable".
        if (!this.isRecord(entry)) {
          this.logger.warn(`AI item ${idx} is not an object, skipping`);
          return;
        }
        results.push(this.normalizeAiItem(entry, idx));
      });
      return results;
    } catch (err: unknown) {
      if (err instanceof SyntaxError) {
        this.logger.error(`JSON parse error: ${String(err)}`);
        throw new BadRequestException('AI returnerade ogiltigt JSON. Försök igen.');
      }
      if (err instanceof BadRequestException) {
        throw err;
      }
      if (err instanceof ServiceUnavailableException) {
        throw err;
      }
      this.logger.error(`AI parsing failed: ${String(err)}`);
      throw new ServiceUnavailableException('AI-tjänsten är inte tillgänglig just nu.');
    }
  }

  /** Type guard: true for non-null, non-array objects. */
  private isRecord(value: unknown): value is Record<string, unknown> {
    return typeof value === 'object' && value !== null && !Array.isArray(value);
  }

  /**
   * Races `promise` against a timer and always clears the timer afterwards.
   *
   * @param promise The operation to wait for.
   * @param timeoutMs Time limit in milliseconds.
   * @param timeoutMessage Message of the exception raised on timeout.
   * @returns The value `promise` resolves to.
   * @throws ServiceUnavailableException When the timer fires first.
   */
  private async withTimeout<T>(
    promise: Promise<T>,
    timeoutMs: number,
    timeoutMessage: string,
  ): Promise<T> {
    let timeoutHandle: ReturnType<typeof setTimeout> | undefined;

    const timeoutPromise = new Promise<never>((_, reject) => {
      timeoutHandle = setTimeout(() => {
        reject(new ServiceUnavailableException(timeoutMessage));
      }, timeoutMs);
    });

    try {
      return await Promise.race([promise, timeoutPromise]);
    } finally {
      clearTimeout(timeoutHandle);
    }
  }

  /**
   * Builds the prompt for Mistral.
   *
   * @param text Flyer text; only the first `maxPromptChars` characters are used.
   * @returns The complete prompt string.
   */
  private buildPrompt(text: string): string {
    // Truncate long text to save tokens; log it so dropped content is visible.
    const isTruncated = text.length > this.maxPromptChars;
    const truncatedText = isTruncated ? text.substring(0, this.maxPromptChars) : text;
    if (isTruncated) {
      this.logger.warn(
        `Flyer text truncated from ${text.length} to ${this.maxPromptChars} chars before AI parsing`,
      );
    }

    return `Du är en expert på att tolka svenska matvaruflyers (t.ex. från Willys, Coop, ICA).

Extrahera ALL produktinformation från följande text och returnera den som en JSON-array.

För varje produkt, inkludera:
- name: Produktnamn (fullständigt namn)
- weight: Vikt (om tillgänglig, t.ex. "150g", "Ca 1kg") eller null
- origin: Ursprung/land/märke (om tillgänglig, t.ex. "FALKENBERG") eller null
- price: Pris som nummer (t.ex. 39.90) eller null
- comparisonPrice: Jämförpris som nummer (t.ex. 266.00) eller null
- unit: Enhet (kg, st, förp, l, etc.) eller null
- offer: Erbjudande som array (t.ex. ["Max 3 köp/hushåll"]) eller []
- category: Kategori (t.ex. "Fisk", "Kött", "Mejeri", "Grönsaker", "Frukt", "Dryck") eller null
- validFrom: Giltig från (datum i formatet YYYY-MM-DD) eller null
- validTo: Giltig till (datum i formatet YYYY-MM-DD) eller null

Texten att tolka:
${truncatedText}

Returnera ENDAST en JSON-array. Inga andra kommentarer, ingen markdown-markup.
Exempel på utdata:
[
  {
    "name": "KALLRÖKT LAX, GRAVAD LAX",
    "weight": "150g",
    "origin": "FALKENBERG",
    "price": 39.90,
    "comparisonPrice": 266.00,
    "unit": "kg",
    "offer": ["Max 3 köp/hushåll"],
    "category": "Fisk",
    "validFrom": "2026-05-18",
    "validTo": "2026-05-24"
  }
]`;
  }

  /**
   * Cleans the AI answer so it can be parsed as JSON: strips markdown code
   * fences and, if a bracketed span exists, keeps only the text from the
   * first `[` to the last `]`.
   */
  private sanitizeJsonResponse(content: string): string {
    // Remove markdown fences.
    let cleaned = content.replace(/```json\n?/g, '').replace(/```\n?/g, '');
    cleaned = cleaned.trim();

    // Try to extract the JSON array if there is surrounding text.
    const jsonMatch = cleaned.match(/\[[\s\S]*\]/);
    if (jsonMatch) {
      cleaned = jsonMatch[0];
    }

    return cleaned;
  }

  /**
   * Coerces one raw AI item into an {@link AiFlyerParseResult}.
   *
   * @param item Raw object from the model's JSON array.
   * @param index Position in that array; used for the fallback product name.
   */
  private normalizeAiItem(item: Record<string, unknown>, index: number): AiFlyerParseResult {
    const toNumber = (val: unknown): number | null => {
      // JSON such as 1e999 parses to Infinity; treat non-finite values as missing.
      if (typeof val === 'number') return Number.isFinite(val) ? val : null;
      if (typeof val === 'string') {
        const compact = val
          .trim()
          // Drop a thousands-group space ("1 299,00" -> "1299,00") but leave
          // other spaces alone so "39 90" is not glued into 3990.
          .replace(/(\d)\s(?=\d{3}(?!\d))/g, '$1')
          // Swedish decimal comma -> dot.
          .replace(',', '.');
        const parsed = parseFloat(compact);
        return Number.isFinite(parsed) ? parsed : null;
      }
      return null;
    };

    const toText = (val: unknown): string | null => {
      if (typeof val === 'string') return val.trim() || null;
      return null;
    };

    const toTextList = (val: unknown): string[] => {
      if (Array.isArray(val)) {
        return val.map((v) => String(v)).filter((v) => v.trim());
      }
      return [];
    };

    const rawName = toText(item.name) || `Produkt ${index + 1}`;
    const normalizedName = this.normalizeName(rawName);

    return {
      rawName,
      normalizedName,
      category: toText(item.category),
      price: toNumber(item.price),
      priceUnit: toText(item.unit),
      comparisonPrice: toNumber(item.comparisonPrice),
      comparisonUnit: toText(item.comparisonUnit),
      // `offer` may be a plain string or an array of strings.
      offerText: toText(item.offer) || (toTextList(item.offer).join(' ') || null),
      confidence: 0.85, // AI-parsed items get a medium-high confidence
      reasonCodes: ['ai_parsed'],
    };
  }

  /**
   * Simple product-name normalization: lower-case, remove every character
   * that is not a-z, å, ä, ö, a digit or whitespace, collapse whitespace, trim.
   */
  private normalizeName(name: string): string {
    return name
      .toLowerCase()
      .replace(/[^a-zåäö0-9\s]/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }
}
|
||||
@@ -0,0 +1,109 @@
|
||||
import { Test, TestingModule } from '@nestjs/testing';
|
||||
import { FlyerNormalizerService } from './flyer-normalizer.service';
|
||||
|
||||
// Unit tests for FlyerNormalizerService.normalize().
describe('FlyerNormalizerService', () => {
  let service: FlyerNormalizerService;

  // Resolve a fresh service instance from a Nest testing module before each case.
  beforeEach(async () => {
    const moduleRef: TestingModule = await Test.createTestingModule({
      providers: [FlyerNormalizerService],
    }).compile();

    service = moduleRef.get<FlyerNormalizerService>(FlyerNormalizerService);
  });

  it('should be defined', () => {
    expect(service).toBeDefined();
  });

  describe('normalize', () => {
    it('should normalize a valid item', () => {
      const validItem = {
        rawName: 'KALLRÖKT LAX, GRAVAD LAX',
        normalizedName: 'kallrökt lax gravad lax',
        category: 'Fisk',
        price: 39.9,
        comparisonPrice: 266.0,
        unit: 'kg',
        offer: ['Max 3 köp/hushåll'],
        confidence: 0.85,
        reasonCodes: ['ai_parsed'],
      };

      const normalized = service.normalize([validItem]);

      expect(normalized).toHaveLength(1);
      const first = normalized[0];
      expect(first.rawName).toBe('KALLRÖKT LAX, GRAVAD LAX');
      expect(first.price).toBe(39.9);
      expect(first.priceUnit).toBe('kg');
      expect(first.categoryHint).toBe('Fisk');
    });

    it('should handle missing fields gracefully', () => {
      // Only `name` is supplied; every other field is missing.
      const normalized = service.normalize([{ name: 'PRODUKT' }]);

      expect(normalized).toHaveLength(1);
      const first = normalized[0];
      expect(first.rawName).toBe('PRODUKT');
      expect(first.price).toBeNull();
      expect(first.categoryHint).toBeNull();
    });

    it('should skip items without name', () => {
      const nameless = { price: 100 };
      const named = { rawName: 'VALID PRODUCT', price: 50 };

      const normalized = service.normalize([nameless, named]);

      expect(normalized).toHaveLength(1);
      expect(normalized[0].rawName).toBe('VALID PRODUCT');
    });

    it('should normalize units correctly', () => {
      const normalized = service.normalize([
        { rawName: 'Mjölk', unit: 'L' },
        { rawName: 'Smör', unit: 'styck' },
        { rawName: 'Socker', unit: 'KG' },
      ]);
      const expectedUnits = ['l', 'st', 'kg'];

      expect(normalized).toHaveLength(3);
      expectedUnits.forEach((unit, position) => {
        expect(normalized[position].priceUnit).toBe(unit);
      });
    });

    it('should parse Swedish prices correctly', () => {
      const normalized = service.normalize([
        { rawName: 'Produkt1', price: '39,90' },
        { rawName: 'Produkt2', price: 39.9 },
        { rawName: 'Produkt3', price: '100' },
      ]);
      const expectedPrices = [39.9, 39.9, 100];

      expectedPrices.forEach((price, position) => {
        expect(normalized[position].price).toBe(price);
      });
    });

    it('should return empty list for non-array input', () => {
      // null first, then undefined, matching the order of the original checks.
      for (const notAnArray of [null, undefined]) {
        expect(service.normalize(notAnArray as any)).toEqual([]);
      }
    });
  });
});
|
||||
@@ -0,0 +1,158 @@
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
|
||||
/**
 * A single flyer product after normalization by `FlyerNormalizerService`.
 *
 * Every optional piece of information is represented as `null` (never
 * `undefined`) when it is missing or could not be interpreted.
 */
export interface NormalizedFlyerItem {
  // --- Naming -------------------------------------------------------------

  /** Product name as supplied by the parser (trimmed). */
  rawName: string;

  /** Cleaned-up variant of the name (supplied by the parser or derived from `rawName`). */
  normalizedName: string;

  /** Category label, or `null` when the parser's category is missing or unknown. */
  categoryHint: string | null;

  // --- Pricing ------------------------------------------------------------

  /** Price as a number, or `null` when absent or unparsable. */
  price: number | null;

  /** Canonical unit abbreviation for `price` (e.g. `'kg'`, `'st'`), or `null`. */
  priceUnit: string | null;

  /** Comparison price as a number, or `null` when absent or unparsable. */
  comparisonPrice: number | null;

  /** Canonical unit abbreviation for `comparisonPrice`, or `null`. */
  comparisonUnit: string | null;

  /** Free-text offer description, or `null` when there is none. */
  offerText: string | null;

  // --- Parse quality ------------------------------------------------------

  /** Confidence taken from the parsed item; the normalizer falls back to 0.85 when none is supplied. */
  parseConfidence: number;

  /** Reason codes attached to the parse result (`['normalized']` when the parser supplied none). */
  parseReasons: string[];
}
|
||||
|
||||
/**
 * Turns the loosely-typed product list produced by the AI flyer parser into
 * strictly-typed {@link NormalizedFlyerItem} records.
 *
 * The input is treated as untrusted: every field is type-checked before use
 * and anything that cannot be interpreted becomes `null` (or a default)
 * instead of leaking an unexpected type into the result.
 */
@Injectable()
export class FlyerNormalizerService {
  /** Confidence assigned when the parsed item does not carry a usable one. */
  private static readonly DEFAULT_CONFIDENCE = 0.85;

  private readonly logger = new Logger(FlyerNormalizerService.name);

  /** Accepted unit spellings (lower-case, dots removed) -> canonical abbreviation. */
  private readonly UNIT_MAPPING: Record<string, string> = {
    // Length
    mm: 'mm',
    cm: 'cm',
    m: 'm',
    // Weight
    mg: 'mg',
    g: 'g',
    hg: 'hg',
    kg: 'kg',
    ton: 'ton',
    // Volume
    ml: 'ml',
    cl: 'cl',
    dl: 'dl',
    l: 'l',
    // Other
    st: 'st',
    styck: 'st',
    stycke: 'st',
    pkt: 'pkt',
    paket: 'pkt',
    fp: 'pkt',
    förp: 'pkt',
    förpackning: 'pkt',
  };

  /**
   * Category keys expected from the AI (lower-case) -> display label.
   * Kept as a field so the table is built once instead of on every call.
   */
  private readonly CATEGORY_MAPPING: Record<string, string> = {
    fisk: 'Fisk',
    kött: 'Kött',
    mejeri: 'Mejeri',
    grönsaker: 'Grönsaker',
    frukt: 'Frukt',
    dryck: 'Dryck',
    frukt_grönsaker: 'Frukt & Grönsaker',
    fastfood: 'Fastfood',
    bröd: 'Bröd',
    fryst: 'Fryst',
    godis: 'Godis',
    pasta: 'Pasta',
  };

  /**
   * Normalizes an AI-parsed product list.
   *
   * @param items Parsed items; anything that is not an array yields an empty list.
   * @returns One normalized record per usable item, in input order. Items that
   *          are not objects or that have no name are skipped (with a warning).
   */
  normalize(items: any[]): NormalizedFlyerItem[] {
    if (!Array.isArray(items)) {
      this.logger.warn('normalize() received non-array, returning empty list');
      return [];
    }

    return items
      .map((item, idx) => this.normalizeItem(item, idx))
      .filter((item): item is NormalizedFlyerItem => item !== null);
  }

  /**
   * Normalizes a single parsed item.
   *
   * @param item  Raw item from the parser (any shape).
   * @param index Position in the input list, used only for log messages.
   * @returns The normalized record, or `null` when the item must be skipped.
   */
  private normalizeItem(item: unknown, index: number): NormalizedFlyerItem | null {
    if (!item || typeof item !== 'object') {
      this.logger.warn(`Item ${index} is not an object, skipping`);
      return null;
    }

    const fields = item as Record<string, unknown>;

    // The parser may use either `rawName` or `name` for the product name.
    const rawName = this.extractString(fields.rawName) ?? this.extractString(fields.name);
    if (!rawName) {
      this.logger.warn(`Item ${index} has no name, skipping`);
      return null;
    }

    const normalizedName =
      this.extractString(fields.normalizedName) ?? this.normalizeName(rawName);

    return {
      rawName,
      normalizedName,
      categoryHint: this.normalizeCategory(this.extractString(fields.category)),
      price: this.extractPrice(fields.price),
      priceUnit: this.normalizeUnit(this.extractString(fields.unit)),
      comparisonPrice: this.extractPrice(fields.comparisonPrice),
      comparisonUnit: this.normalizeUnit(this.extractString(fields.comparisonUnit)),
      offerText: this.normalizeOfferText(fields.offer),
      parseConfidence: this.extractConfidence(fields.confidence),
      parseReasons: Array.isArray(fields.reasonCodes)
        ? fields.reasonCodes.map((code) => String(code))
        : ['normalized'],
    };
  }

  /** Returns the trimmed string, or `null` for non-strings and blank strings. */
  private extractString(val: unknown): string | null {
    if (typeof val !== 'string') return null;
    const trimmed = val.trim();
    return trimmed.length > 0 ? trimmed : null;
  }

  /**
   * Returns a usable confidence value.
   *
   * Finite numbers (and numeric strings) are accepted as-is; anything else
   * falls back to the default so the result always satisfies the `number`
   * contract of {@link NormalizedFlyerItem.parseConfidence}.
   */
  private extractConfidence(val: unknown): number {
    if (typeof val === 'number' && Number.isFinite(val)) return val;

    if (typeof val === 'string' && val.trim().length > 0) {
      const parsed = Number(val.trim().replace(',', '.'));
      if (Number.isFinite(parsed)) return parsed;
    }

    return FlyerNormalizerService.DEFAULT_CONFIDENCE;
  }

  /**
   * Parses a price given as a number or as a (Swedish-formatted) string.
   *
   * Handled string forms: decimal comma (`"39,90"`), space-grouped thousands
   * (`"1 299,00"`, including non-breaking spaces), dot-grouped thousands
   * followed by a decimal comma (`"1.299,00"`) and colon notation
   * (`"39:90"`, `"39:-"`). Trailing text such as `" kr"` is ignored.
   *
   * @returns The price, or `null` when the value is missing, non-finite or
   *          does not start with a number.
   */
  private extractPrice(val: unknown): number | null {
    if (typeof val === 'number') return Number.isFinite(val) ? val : null;
    if (typeof val !== 'string') return null;

    const candidate = val
      .trim()
      // "1 299,00" / "12 345": whitespace between three-digit groups is a
      // thousands separator. Two-digit tails ("39 90") are left untouched.
      .replace(
        /^([-+]?\d{1,3})((?:\s\d{3})+)(?!\d)/,
        (_match, head: string, groups: string) => head + groups.replace(/\s/g, ''),
      )
      // "1.299,00": dots group thousands only when a decimal comma follows,
      // so a plain "1.299" keeps its previous meaning.
      .replace(
        /^([-+]?\d{1,3})((?:\.\d{3})+)(?=,\d)/,
        (_match, head: string, groups: string) => head + groups.replace(/\./g, ''),
      )
      // Decimal comma -> decimal point.
      .replace(/,/g, '.')
      // Swedish colon notation: "39:90" means 39.90 ("39:-" already parses as 39).
      .replace(/^([-+]?\d+):(\d{2})(?!\d)/, '$1.$2');

    const parsed = parseFloat(candidate);
    return Number.isFinite(parsed) ? parsed : null;
  }

  /**
   * Derives a comparison-friendly name: lower-case, letters/digits/whitespace
   * only, single spaces, trimmed.
   */
  private normalizeName(name: string): string {
    return (
      name
        // Compose decomposed characters first so "a" + combining ring stays "å".
        .normalize('NFC')
        .toLowerCase()
        // Keep every Unicode letter/digit so names like "filé" or "müsli"
        // are not mangled; only punctuation and symbols are removed.
        .replace(/[^\p{L}\p{N}\s]/gu, '')
        .replace(/\s+/g, ' ')
        .trim()
    );
  }

  /** Maps a unit spelling to its canonical abbreviation, or `null` when unknown. */
  private normalizeUnit(unit: string | null): string | null {
    if (!unit) return null;

    const cleaned = unit.trim().toLowerCase().replace(/\./g, '');
    return this.lookup(this.UNIT_MAPPING, cleaned);
  }

  /** Maps an AI category key to its display label, or `null` when unknown. */
  private normalizeCategory(category: string | null): string | null {
    if (!category) return null;

    return this.lookup(this.CATEGORY_MAPPING, category.trim().toLowerCase());
  }

  /**
   * Normalizes the offer text, which may be a string or a list of fragments.
   *
   * @returns The trimmed text (list fragments joined by a single space), or
   *          `null` when nothing usable remains.
   */
  private normalizeOfferText(offer: unknown): string | null {
    if (typeof offer === 'string') {
      return offer.trim() || null;
    }

    if (Array.isArray(offer)) {
      // Only textual fragments are kept; objects/null would otherwise end up
      // as "[object Object]" / "null" in the user-visible text.
      const fragments = offer
        .filter(
          (part): part is string | number =>
            typeof part === 'string' || (typeof part === 'number' && Number.isFinite(part)),
        )
        .map((part) => String(part).trim())
        .filter((part) => part.length > 0);

      return fragments.length > 0 ? fragments.join(' ') : null;
    }

    return null;
  }

  /**
   * Own-property lookup in a mapping table.
   *
   * A plain `table[key]` would also find inherited members, so an input such
   * as `"constructor"` would return a function instead of `null`.
   */
  private lookup(table: Record<string, string>, key: string): string | null {
    return Object.prototype.hasOwnProperty.call(table, key) ? (table[key] ?? null) : null;
  }
}
|
||||
@@ -0,0 +1,100 @@
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import * as fs from 'fs';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
import * as pdf from 'pdf-parse';
|
||||
import Tesseract from 'tesseract.js';
|
||||
|
||||
/**
 * Extracts plain text from an uploaded flyer (PDF, image or plain text).
 */
@Injectable()
export class TextExtractorService {
  /** Minimum number of words pdf-parse must yield before its output is trusted over OCR. */
  private static readonly MIN_PDF_WORDS = 10;

  /** Temp-file extension per known MIME type. */
  private static readonly EXTENSION_BY_MIME: ReadonlyMap<string, string> = new Map([
    ['image/png', '.png'],
    ['image/webp', '.webp'],
    ['image/jpeg', '.jpg'],
    ['text/plain', '.txt'],
    ['application/pdf', '.pdf'],
  ]);

  private readonly logger = new Logger(TextExtractorService.name);

  /**
   * Extracts text from a flyer buffer.
   *
   * - `text/plain` input is decoded directly as UTF-8.
   * - `image/*` input goes straight to OCR (pdf-parse cannot read images).
   * - Anything else is tried with pdf-parse first; when that fails or yields
   *   fewer than {@link MIN_PDF_WORDS} words, OCR is used as a fallback.
   *
   * @param buffer           File contents.
   * @param mimeType         MIME type reported for the upload, if known.
   * @param originalFilename Original file name, used only to pick a temp-file extension.
   * @returns Extracted text (may be empty when OCR finds nothing).
   * @throws Error when OCR is needed and fails.
   */
  async extractText(
    buffer: Buffer,
    mimeType?: string,
    originalFilename?: string,
  ): Promise<string> {
    const normalizedMime = mimeType?.toLowerCase();

    if (normalizedMime === 'text/plain') {
      // Neither pdf-parse nor OCR can handle plain text; the buffer is the text.
      this.logger.debug('Plain-text input, decoding buffer directly');
      return buffer.toString('utf8').trim();
    }

    if (normalizedMime?.startsWith('image/')) {
      this.logger.debug('Image input, skipping pdf-parse and using OCR');
      return this.extractTextViaOCR(buffer, mimeType, originalFilename);
    }

    // Primary attempt: text layer of the PDF.
    try {
      this.logger.debug('Attempting pdf-parse extraction');
      // NOTE(review): `pdf` is a namespace import that is called as a function;
      // whether that works depends on the project's esModuleInterop setting — confirm.
      const pdfData = await pdf(buffer);

      const text = pdfData.text?.trim() || '';
      const wordCount = this.countWords(text);
      this.logger.debug(`pdf-parse extracted ${wordCount} words`);

      // Enough text means the PDF has a real text layer; use it.
      if (wordCount >= TextExtractorService.MIN_PDF_WORDS) {
        return text;
      }

      this.logger.debug('pdf-parse gave too little text, falling back to OCR');
    } catch (err) {
      this.logger.warn(`pdf-parse failed: ${String(err)}`);
    }

    // Fallback: OCR with Tesseract.
    return this.extractTextViaOCR(buffer, mimeType, originalFilename);
  }

  /**
   * Extracts text via OCR (Tesseract, Swedish language data).
   *
   * The buffer is written to a file inside a freshly created private temp
   * directory, which is always removed afterwards — also when writing or
   * recognition fails.
   *
   * NOTE(review): tesseract.js documents image inputs only and lists PDF as
   * unsupported, so this path presumably always fails for PDF buffers —
   * confirm, and consider rasterizing PDF pages before OCR.
   *
   * @param buffer           File contents (image, or PDF — see note above).
   * @param mimeType         MIME type reported for the upload, if known.
   * @param originalFilename Original file name, used only to pick a temp-file extension.
   * @returns Recognized text (empty string when nothing was recognized).
   * @throws Error when the temp file cannot be written or recognition fails.
   */
  private async extractTextViaOCR(
    buffer: Buffer,
    mimeType?: string,
    originalFilename?: string,
  ): Promise<string> {
    let tempDir: string | undefined;

    try {
      this.logger.debug('Starting Tesseract OCR extraction');

      // mkdtemp gives a unique, owner-only directory, so concurrent requests
      // cannot collide and the path is not predictable.
      tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'ocr-'));
      const ext = this.resolveTempExtension(mimeType, originalFilename);
      const tempPath = path.join(tempDir, `input${ext}`);
      await fs.promises.writeFile(tempPath, buffer, { mode: 0o600 });

      const result = await Tesseract.recognize(tempPath, 'swe', {
        logger: (m) => this.logger.debug(`Tesseract: ${m.status}`),
      });

      const text = result.data.text || '';
      this.logger.debug(`Tesseract extracted ${this.countWords(text)} words`);
      return text;
    } catch (err) {
      this.logger.error(`OCR extraction failed: ${String(err)}`);
      throw new Error('Kunde inte extrahera text från flyern (pdf-parse + OCR misslyckades).');
    } finally {
      if (tempDir) {
        try {
          await fs.promises.rm(tempDir, { recursive: true, force: true });
        } catch (cleanupErr) {
          // Best effort: a failed cleanup must not mask the OCR outcome.
          this.logger.debug(`Could not remove OCR temp dir: ${String(cleanupErr)}`);
        }
      }
    }
  }

  /**
   * Picks the extension for the OCR temp file.
   *
   * Known MIME types win; otherwise the extension of the original file name
   * is used when it is a plain `.alnum` suffix; otherwise `.pdf`.
   */
  private resolveTempExtension(mimeType?: string, originalFilename?: string): string {
    const byMime =
      mimeType !== undefined ? TextExtractorService.EXTENSION_BY_MIME.get(mimeType) : undefined;
    if (byMime) return byMime;

    const originalExt = originalFilename ? path.extname(originalFilename).toLowerCase() : '';
    // The file name is client-supplied; accept only a short alphanumeric suffix.
    if (/^\.[a-z0-9]{1,10}$/.test(originalExt)) return originalExt;

    return '.pdf';
  }

  /** Counts whitespace-separated words; an empty or blank text has zero words. */
  private countWords(text: string): number {
    return text.split(/\s+/).filter((word) => word.length > 0).length;
  }
}
|
||||
Reference in New Issue
Block a user