import {
  BadRequestException,
  Injectable,
  Logger,
  ServiceUnavailableException,
} from '@nestjs/common';
import * as fs from 'fs';
import * as path from 'path';

/** One product offer extracted from a flyer, converted to the importer's shape. */
export interface AiFlyerParseResult {
  /** Product name as returned by the model, or a `Produkt N` placeholder when missing. */
  rawName: string;
  /** Lower-cased `rawName` with punctuation removed and whitespace collapsed. */
  normalizedName: string;
  brand: string | null;
  category: string | null;
  price: number | null;
  /** Filled from the model's `unit` field. */
  priceUnit: string | null;
  comparisonPrice: number | null;
  /** Model's `comparisonUnit` when present, otherwise its `unit` field. */
  comparisonUnit: string | null;
  weight: string | null;
  bundleWeight: string | null;
  isBundle: boolean;
  bundleItems: string[];
  /** The model's `offer` value: a string as-is, or an array joined with spaces. */
  offerText: string | null;
  /** Fixed at 0.85 for every item produced by this service. */
  confidence: number;
  /** Always `['ai_parsed']` for items produced by this service. */
  reasonCodes: string[];
}

/** Diagnostics describing how a parse run was executed. */
export interface AiFlyerParseTrace {
  /** Prompts of all chunks joined by a `-----` separator, or null when there were none. */
  prompt: string | null;
  /** Raw model answers of all chunks joined by a `-----` separator, or null when there were none. */
  rawOutput: string | null;
  chunkCount: number;
  /** Number of extra attempts (beyond the first) summed over all chunks. */
  retryCount: number;
}

/** Directory and file-name prefix used for the debug files of one parse run. */
type DebugSession = { dirPath: string; baseName: string };

/** The only part of the chat response this service reads. */
interface ChatResponseLike {
  choices?: Array<{ message?: { content?: unknown } }>;
}

/** Model id sent with every chat request. */
const MODEL_NAME = 'ministral-8b-2512';

/**
 * Maximum number of chunk characters placed in the prompt on attempt 1, 2 and 3.
 * These values apply to chunk sizes up to 3000 characters; larger configured
 * chunk sizes scale them proportionally (see `parseChunkWithRetry`).
 */
const BASE_TEXT_WINDOWS: readonly number[] = [3000, 2200, 1600];

/**
 * Instruction text sent to the model. The segments are joined with a single
 * space and the flyer text is appended after the last segment.
 * NOTE(review): the prompt therefore reaches the model as one long line;
 * confirm whether line breaks between the segments are wanted.
 */
const PROMPT_INSTRUCTIONS: readonly string[] = [
  'Du tolkar svenska matvaruflyers och ska returnera ENDAST en JSON-array.',
  'Returnera objekt med exakt dessa fält:',
  '- name: string (produkttitel)',
  '- brand: string | null',
  '- category: string | null',
  '- isBundle: boolean',
  '- weight: string | null (vikt/storlek for en enskild produkt)',
  '- bundleWeight: string | null (totalvikt for hela kombipaketet)',
  '- bundleItems: string[] (ingående produkter i paketet, tom array om ej bundle)',
  '- price: number | null',
  '- comparisonPrice: number | null',
  '- unit: string | null (enhet for jamforpris, t.ex. kg/l/st)',
  '- offer: string[]',
  'Arbetssatt (viktigt):',
  'Steg A) Identifiera om texten ar en gruppannons med flera varianter + gemensamma attribut.',
  'Steg B) Returnera en post per faktisk produktvariant med arvd metadata.',
  'Regler:',
  '1) Vanlig produkt (ej bundle): isBundle=false, bundleWeight=null, bundleItems=[].',
  '2) Kombipaket/bundle: isBundle=true, name ska vara paketets huvudnamn, bundleWeight totalvikt.',
  '3) For bundle ska bundleItems innehalla de ingaende produkterna, t.ex. ["Chumlax 3x100g", "Alaska pollock 3x100g"].',
  '4) price ar priset for hela forpackningen. comparisonPrice ar jamforpris som tal ("83:17" -> 83.17).',
  '5) offer innehaller kampanjtext som "Max 10 kop/hushall".',
  '6) Om en rubrik/lista innehaller flera kommaseparerade namn och efterfoljande rad/rader innehaller gemensam brand, vikt, pris eller kampanjvillkor: expandera till separata objekt (en per namn) och arv all gemensam metadata.',
  '7) Tillämpa samma split-regel generellt for liknande tillbud (inte bara ost), nar listan tydligt representerar produktvarianter/smaker/sorter.',
  '8) Splitta INTE om listan snarare ar ingredienser, avdelningar, eller otydlig marknadsforing utan tydlig produktvariant.',
  '9) Specialregel ost: namn som PRAST/HERRGARD/GREVE ska normaliseras till Prästost/Herrgårdsost/Grevéost.',
  '10) Om texten innehaller "ARLA KO" ska brand vara exakt "Arla Ko".',
  '11) For ovan ostsorter ska category vara "Hardost".',
  '12) Behåll svenska diakritiska tecken (ä, å, ö, é) i produktnamn. Returnera "Prästost", "Herrgårdsost", "Grevéost" - inte ASCII-versioner.',
  '13) Returnera aldrig extra nycklar, text, markdown eller forklaringar utanfor JSON-arrayen.',
  'Exempel bundle utdata:',
  '[ { "name": "Kaptenens Favoriter", "brand": "Kapten Royal", "category": "Fisk", "isBundle": true, "weight": null, "bundleWeight": "600g", "bundleItems": ["Chumlax 3x100g", "Alaska pollock 3x100g"], "price": 49.90, "comparisonPrice": 83.17, "unit": "kg", "offer": ["Max 10 kop/hushall"] } ]',
  'Exempel enkel produkt utdata:',
  '[ { "name": "ICA Basic Mjolk 1,5%", "brand": "ICA Basic", "category": "Mejeri", "isBundle": false, "weight": "1l", "bundleWeight": null, "bundleItems": [], "price": 12.90, "comparisonPrice": 12.90, "unit": "l", "offer": [] } ]',
  'Exempel gruppannons med varianter (ska splittas):',
  'Input-idé: "PRAST, HERRGARD, GREVE" + "ARLA KO" + gemensam vikt/pris.',
  'Output-idé: [',
  '{ "name": "Prästost", "brand": "Arla Ko", "category": "Hardost", "isBundle": false, "weight": "667g", "bundleWeight": null, "bundleItems": [], "price": null, "comparisonPrice": 79.90, "unit": "kg", "offer": ["Max 3 forp/hushall"] },',
  '{ "name": "Herrgårdsost", "brand": "Arla Ko", "category": "Hardost", "isBundle": false, "weight": "667g", "bundleWeight": null, "bundleItems": [], "price": null, "comparisonPrice": 79.90, "unit": "kg", "offer": ["Max 3 forp/hushall"] }',
  ']',
  'Exempel negativt fall (ska INTE splittas):',
  'Input-idé: "Ingredienser: tomat, lok, vitlok".',
  'Output-idé: en produktpost (ingen variant-expansion).',
  'Text att tolka:',
];

/** True for plain (non-null, non-array) objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Converts a model value to a finite number, or null when that is not possible. */
function toNumberOrNull(val: unknown): number | null {
  if (typeof val === 'number') return Number.isFinite(val) ? val : null;
  if (typeof val === 'string') {
    // Flyers write decimals as "12,90" or "83:17"; convert the first separator to a dot.
    const parsed = parseFloat(val.replace(/[,:]/, '.'));
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}

/** Returns the trimmed string, or null for non-strings and blank strings. */
function toTrimmedString(val: unknown): string | null {
  if (typeof val !== 'string') return null;
  return val.trim() || null;
}

/** Keeps the non-blank string/number entries of an array; anything else yields []. */
function toStringArray(val: unknown): string[] {
  if (!Array.isArray(val)) return [];
  return val
    // Skip null/objects so they do not end up as "null" or "[object Object]".
    .filter((entry): entry is string | number => typeof entry === 'string' || typeof entry === 'number')
    .map((entry) => String(entry))
    .filter((entry) => entry.trim().length > 0);
}

/** Truthiness conversion that also treats strings such as "false" or "0" as false. */
function toBoolean(val: unknown): boolean {
  if (typeof val === 'string') {
    return !['', 'false', '0', 'no', 'nej', 'null'].includes(val.trim().toLowerCase());
  }
  return Boolean(val);
}

/**
 * Extracts structured product offers from Swedish grocery-flyer text by
 * sending it, chunk by chunk, to a Mistral chat model and normalizing the
 * JSON answer.
 *
 * Configuration is read from environment variables in the constructor:
 * `MISTRAL_API_KEY` (required), `FLYER_AI_TIMEOUT_MS`, `FLYER_AI_RETRIES`,
 * `FLYER_AI_CHUNK_SIZE_CHARS`, `FLYER_AI_CHUNK_OVERLAP_CHARS`,
 * `FLYER_AI_MAX_CHUNKS`, `FLYER_AI_DEBUG` and `FLYER_AI_DEBUG_DIR`.
 */
@Injectable()
export class AiFlyerParserService {
  private readonly logger = new Logger(AiFlyerParserService.name);
  private readonly timeoutMs: number;
  private readonly maxRetries: number;
  private readonly chunkSizeChars: number;
  private readonly chunkOverlapChars: number;
  private readonly maxChunks: number;
  private readonly debugEnabled: boolean;
  private readonly debugDirectory: string;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- SDK is loaded lazily through a dynamic import
  private mistral: any;
  private readonly apiKey: string;

  /**
   * @throws Error when `MISTRAL_API_KEY` is missing or blank.
   */
  constructor() {
    // Trim so a whitespace-only key is rejected here instead of failing on the first request.
    this.apiKey = (process.env.MISTRAL_API_KEY ?? '').trim();
    if (!this.apiKey) {
      throw new Error('MISTRAL_API_KEY environment variable not set');
    }

    this.timeoutMs = this.readPositiveIntEnv('FLYER_AI_TIMEOUT_MS', 30_000);
    this.maxRetries = this.readPositiveIntEnv('FLYER_AI_RETRIES', 2);
    this.chunkSizeChars = this.readPositiveIntEnv('FLYER_AI_CHUNK_SIZE_CHARS', 3_000);
    this.chunkOverlapChars = this.readPositiveIntEnv('FLYER_AI_CHUNK_OVERLAP_CHARS', 300);
    this.maxChunks = this.readPositiveIntEnv('FLYER_AI_MAX_CHUNKS', 8);
    this.debugEnabled = this.readBooleanEnv('FLYER_AI_DEBUG', false);
    this.debugDirectory = process.env.FLYER_AI_DEBUG_DIR?.trim() || path.join(process.cwd(), 'debug');
  }

  /** Lazily imports the Mistral SDK and caches one client instance. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- see `mistral` field
  private async getClient(): Promise<any> {
    if (this.mistral) return this.mistral;
    const mistralModule = await import('@mistralai/mistralai');
    this.mistral = new mistralModule.default(this.apiKey);
    return this.mistral;
  }

  /**
   * Sends flyer text to the model for structured extraction.
   *
   * The text is split into overlapping chunks that are parsed one after
   * another; the combined items are de-duplicated before being returned.
   *
   * @param text Flyer text (for example from pdf-parse or OCR).
   * @returns The parsed products plus a trace of prompts, raw answers and retries.
   * @throws BadRequestException when `text` is blank, or the model answer is
   *   empty, not a JSON array, or not valid JSON.
   * @throws ServiceUnavailableException on timeouts and any other failure.
   */
  async parseWithAI(text: string): Promise<{ items: AiFlyerParseResult[]; trace: AiFlyerParseTrace }> {
    if (!text || text.trim().length === 0) {
      throw new BadRequestException('Flyer-texten är tom. Kan inte fortsätta.');
    }

    const debugSession = this.createDebugSession('AI-flyerimporter');

    try {
      if (debugSession) {
        await this.writeDebugFile(debugSession, `${debugSession.baseName}-input.txt`, text);
      }

      const client = await this.getClient();
      const chunks = this.splitIntoChunks(text);
      this.logger.debug(`Parsing flyer text in ${chunks.length} chunk(s)`);

      if (debugSession) {
        await this.writeDebugFile(
          debugSession,
          `${debugSession.baseName}-chunks.json`,
          JSON.stringify(chunks, null, 2),
        );
      }

      const allItems: AiFlyerParseResult[] = [];
      const prompts: string[] = [];
      const rawResponses: string[] = [];
      let retryCount = 0;

      // Chunks are processed sequentially, one request at a time.
      for (const [index, chunk] of chunks.entries()) {
        const chunkResult = await this.parseChunkWithRetry(
          client,
          chunk,
          index + 1,
          chunks.length,
          debugSession,
        );
        allItems.push(...chunkResult.items);
        prompts.push(chunkResult.prompt);
        rawResponses.push(chunkResult.rawOutput);
        retryCount += Math.max(0, chunkResult.attemptsUsed - 1);
      }

      const deduped = this.dedupeItems(allItems);
      const trace: AiFlyerParseTrace = {
        prompt: prompts.length > 0 ? prompts.join('\n\n-----\n\n') : null,
        rawOutput: rawResponses.length > 0 ? rawResponses.join('\n\n-----\n\n') : null,
        chunkCount: chunks.length,
        retryCount,
      };

      if (debugSession) {
        await this.writeDebugFile(
          debugSession,
          `${debugSession.baseName}-result.json`,
          JSON.stringify(deduped, null, 2),
        );
      }

      return { items: deduped, trace };
    } catch (err) {
      if (debugSession) {
        await this.writeDebugFile(
          debugSession,
          `${debugSession.baseName}-error.txt`,
          this.toErrorMessage(err),
        );
      }

      // JSON.parse failures surface as SyntaxError.
      if (err instanceof SyntaxError) {
        this.logger.error(`JSON parse error: ${String(err)}`);
        throw new BadRequestException('AI returnerade ogiltigt JSON. Försök igen.');
      }
      if (err instanceof BadRequestException) {
        throw err;
      }
      if (err instanceof ServiceUnavailableException) {
        throw err;
      }

      this.logger.error(`AI parsing failed: ${String(err)}`);
      throw new ServiceUnavailableException('AI-tjänsten är inte tillgänglig just nu.');
    }
  }

  /**
   * Resolves with `promise`, or rejects with a ServiceUnavailableException
   * carrying `timeoutMessage` if it has not settled within `timeoutMs`.
   * The timer is always cleared; the underlying operation is not cancelled.
   */
  private async withTimeout<T>(
    promise: Promise<T>,
    timeoutMs: number,
    timeoutMessage: string,
  ): Promise<T> {
    let timeoutHandle: ReturnType<typeof setTimeout> | null = null;
    const timeoutPromise = new Promise<never>((_, reject) => {
      timeoutHandle = setTimeout(() => {
        reject(new ServiceUnavailableException(timeoutMessage));
      }, timeoutMs);
    });

    try {
      return await Promise.race([promise, timeoutPromise]);
    } finally {
      if (timeoutHandle) clearTimeout(timeoutHandle);
    }
  }

  /**
   * Builds the prompt for the model: the fixed instructions followed by the
   * flyer text, cut to at most `maxTextLength` characters.
   */
  private buildPrompt(text: string, maxTextLength: number): string {
    const truncatedText = text.length > maxTextLength ? text.substring(0, maxTextLength) : text;
    return [...PROMPT_INSTRUCTIONS, truncatedText].join(' ');
  }

  /**
   * Cleans the model answer so it can be parsed as JSON: removes Markdown code
   * fences and keeps only the span from the first `[` to the last `]`.
   */
  private sanitizeJsonResponse(content: string): string {
    const withoutFences = content.replace(/```json\n?/g, '').replace(/```\n?/g, '').trim();
    const jsonMatch = withoutFences.match(/\[[\s\S]*\]/);
    return jsonMatch ? jsonMatch[0] : withoutFences;
  }

  /**
   * Normalizes and type-converts one model item to `AiFlyerParseResult`.
   *
   * @param item One object from the model's JSON array.
   * @param index Zero-based position in that array; used for the placeholder name.
   */
  private normalizeAiItem(item: Record<string, unknown>, index: number): AiFlyerParseResult {
    const rawName = toTrimmedString(item.name) ?? `Produkt ${index + 1}`;
    const unit = toTrimmedString(item.unit);

    return {
      rawName,
      normalizedName: this.normalizeName(rawName),
      brand: toTrimmedString(item.brand),
      category: toTrimmedString(item.category),
      price: toNumberOrNull(item.price),
      priceUnit: unit,
      comparisonPrice: toNumberOrNull(item.comparisonPrice),
      // The prompt only asks for `unit` and defines it as the comparison-price
      // unit, so fall back to it when the model sends no `comparisonUnit`.
      comparisonUnit: toTrimmedString(item.comparisonUnit) ?? unit,
      weight: toTrimmedString(item.weight),
      bundleWeight: toTrimmedString(item.bundleWeight),
      isBundle: toBoolean(item.isBundle),
      bundleItems: toStringArray(item.bundleItems),
      offerText: toTrimmedString(item.offer) ?? (toStringArray(item.offer).join(' ') || null),
      confidence: 0.85,
      reasonCodes: ['ai_parsed'],
    };
  }

  /**
   * Lower-cases a name, drops every character except a-z, å, ä, ö, é, digits
   * and whitespace, and collapses whitespace runs.
   */
  private normalizeName(name: string): string {
    return name
      // Compose first so decomposed "a" + combining ring stays "å" instead of losing its mark.
      .normalize('NFC')
      .toLowerCase()
      .replace(/[^a-zåäöé0-9\s]/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Splits text into at most `maxChunks` chunks of `chunkSizeChars` characters,
   * where consecutive chunks overlap by `chunkOverlapChars` characters.
   * Text that does not fit is dropped and a warning is logged.
   */
  private splitIntoChunks(text: string): string[] {
    const normalized = text.replace(/\r\n/g, '\n').trim();
    if (!normalized) return [];
    if (normalized.length <= this.chunkSizeChars) {
      return [normalized];
    }

    // An overlap >= the chunk size would keep the window from advancing and
    // produce the same chunk `maxChunks` times, so cap it below the chunk size.
    const overlap = Math.min(this.chunkOverlapChars, this.chunkSizeChars - 1);

    const chunks: string[] = [];
    let start = 0;
    let coveredUpTo = 0;

    while (start < normalized.length && chunks.length < this.maxChunks) {
      const end = Math.min(start + this.chunkSizeChars, normalized.length);
      const chunk = normalized.slice(start, end).trim();
      if (chunk) chunks.push(chunk);
      coveredUpTo = end;
      if (end >= normalized.length) break;
      start = end - overlap;
    }

    if (coveredUpTo < normalized.length) {
      this.logger.warn(
        `Flyer text truncated: ${normalized.length - coveredUpTo} of ${normalized.length} chars ` +
          `were not parsed (limit of ${this.maxChunks} chunks reached).`,
      );
    }

    return chunks;
  }

  /**
   * Parses one chunk, retrying retryable failures with a shorter text window.
   *
   * At most `maxRetries + 1` attempts are made, capped at three (one per text
   * window). Later attempts send fewer characters of the chunk.
   *
   * @param client Mistral client returned by `getClient`.
   * @param chunkText Text of this chunk.
   * @param chunkIndex One-based chunk number, used for logging and debug files.
   * @param totalChunks Total number of chunks, used for logging.
   * @param debugSession Debug output location, or null when debugging is off.
   * @throws The error of the last attempt, or of the first non-retryable failure.
   */
  private async parseChunkWithRetry(
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- see `mistral` field
    client: any,
    chunkText: string,
    chunkIndex: number,
    totalChunks: number,
    debugSession: DebugSession | null,
  ): Promise<{
    items: AiFlyerParseResult[];
    prompt: string;
    rawOutput: string;
    attemptsUsed: number;
  }> {
    // Scale the windows with the configured chunk size so chunks larger than
    // 3000 characters are not cut on the first attempt. Sizes up to 3000 keep
    // the base windows unchanged.
    const baseWindow = BASE_TEXT_WINDOWS[0] ?? this.chunkSizeChars;
    const scale = Math.max(1, this.chunkSizeChars / baseWindow);
    const textWindows = BASE_TEXT_WINDOWS.map((size) => Math.round(size * scale));
    const attempts = Math.max(1, Math.min(this.maxRetries + 1, textWindows.length));

    let lastError: unknown = null;

    for (const [i, textWindow] of textWindows.slice(0, attempts).entries()) {
      const prompt = this.buildPrompt(chunkText, textWindow);

      try {
        this.logger.debug(
          `Sending request to ${MODEL_NAME} (chunk ${chunkIndex}/${totalChunks}, attempt ${i + 1}/${attempts}, timeout=${this.timeoutMs}ms, textWindow=${textWindow})`,
        );

        if (debugSession) {
          await this.writeDebugFile(
            debugSession,
            `${debugSession.baseName}-chunk-${chunkIndex}-attempt-${i + 1}-prompt.txt`,
            prompt,
          );
        }

        const response = await this.withTimeout<ChatResponseLike>(
          client.chat({
            model: MODEL_NAME,
            messages: [{ role: 'user', content: prompt }],
            temperature: 0.1,
          }),
          this.timeoutMs,
          'Mistral-anrop timeout',
        );

        const content = this.ensureUtf8Content(response.choices?.[0]?.message?.content);
        if (!content) {
          throw new BadRequestException('Tomt svar från AI-modellen.');
        }

        this.logger.debug(`Mistral response length: ${content.length} chars`);

        if (debugSession) {
          await this.writeDebugFile(
            debugSession,
            `${debugSession.baseName}-chunk-${chunkIndex}-attempt-${i + 1}-response.txt`,
            content,
          );
        }

        const jsonString = this.sanitizeJsonResponse(content);
        const parsed: unknown = JSON.parse(jsonString);
        if (!Array.isArray(parsed)) {
          throw new BadRequestException('AI returnerade inte en JSON-array.');
        }

        // Skip entries that are not objects (null, strings, nested arrays)
        // instead of failing the whole chunk with a TypeError.
        const items = parsed.flatMap((entry: unknown, idx: number) =>
          isRecord(entry) ? [this.normalizeAiItem(entry, idx)] : [],
        );

        return { items, prompt, rawOutput: content, attemptsUsed: i + 1 };
      } catch (attemptErr) {
        lastError = attemptErr;

        if (debugSession) {
          await this.writeDebugFile(
            debugSession,
            `${debugSession.baseName}-chunk-${chunkIndex}-attempt-${i + 1}-error.txt`,
            this.toErrorMessage(attemptErr),
          );
        }

        if (!this.isRetryableError(attemptErr) || i === attempts - 1) {
          throw attemptErr;
        }

        this.logger.warn(
          `Mistral chunk ${chunkIndex}/${totalChunks} attempt ${i + 1} failed (${this.toErrorMessage(attemptErr)}). Retrying with shorter text window.`,
        );
      }
    }

    // Only reachable if the loop ends without returning or throwing.
    throw lastError instanceof Error
      ? lastError
      : new ServiceUnavailableException('AI-anrop misslyckades');
  }

  /**
   * Removes duplicates, keeping the first occurrence. Two items are duplicates
   * when name, brand, price, units, comparison price, offer signature and
   * bundle flag all match after normalization.
   */
  private dedupeItems(items: AiFlyerParseResult[]): AiFlyerParseResult[] {
    const seen = new Set<string>();
    const deduped: AiFlyerParseResult[] = [];

    for (const item of items) {
      const key = [
        item.normalizedName.trim(),
        (item.brand ?? '').trim().toLowerCase(),
        item.price == null ? '' : Number(item.price).toFixed(2),
        (item.priceUnit ?? '').trim().toLowerCase(),
        item.comparisonPrice == null ? '' : Number(item.comparisonPrice).toFixed(2),
        (item.comparisonUnit ?? '').trim().toLowerCase(),
        this.offerSignature(item.offerText),
        item.isBundle ? '1' : '0',
      ].join('|');

      if (seen.has(key)) continue;
      seen.add(key);
      deduped.push(item);
    }

    return deduped;
  }

  /**
   * Reduces offer text to a comparable signature: lower-cased, diacritics
   * removed, non-alphanumerics replaced by spaces. Returns '' when the text is
   * blank or contains none of the campaign markers.
   * NOTE(review): the markers are matched as substrings (no word boundaries),
   * so short ones like "ta" and "for" match inside many words — confirm this
   * is intended before tightening.
   */
  private offerSignature(offerText: string | null | undefined): string {
    if (!offerText || offerText.trim().length === 0) return '';

    const normalized = offerText
      .toLowerCase()
      .normalize('NFD')
      .replace(/[\u0300-\u036f]/g, '')
      .replace(/[^a-z0-9\s]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();

    if (!normalized) return '';

    const hasCampaignMarkers =
      /(max|hogst|begransat|hushall|kund|kop|for|betala|ta)/.test(normalized) ||
      /(\d+\s*for\s*\d+)/.test(normalized) ||
      /(ta\s*\d+\s*betala\s*for\s*\d+)/.test(normalized);

    return hasCampaignMarkers ? normalized : '';
  }

  /**
   * Flattens the model content to a string and round-trips it through a UTF-8
   * buffer. When debugging is enabled and a replacement character (U+FFFD) is
   * present, a hex preview is logged.
   */
  private ensureUtf8Content(content: unknown): string {
    const asString = this.flattenContent(content);
    if (!asString) return '';

    const utf8 = Buffer.from(asString, 'utf8').toString('utf8');
    if (this.debugEnabled && (asString.includes('\uFFFD') || utf8.includes('\uFFFD'))) {
      const hex = Buffer.from(asString, 'utf8').toString('hex').slice(0, 256);
      this.logger.debug(`Potential encoding issue in AI response (hex preview): ${hex}`);
    }

    return utf8;
  }

  /**
   * Converts message content to a string. Strings pass through; arrays have
   * their string parts and `{ text }` parts concatenated; null/undefined give
   * ''; anything else is stringified.
   */
  private flattenContent(content: unknown): string {
    if (typeof content === 'string') return content;
    if (content == null) return '';
    if (!Array.isArray(content)) return String(content);

    return content
      .map((part: unknown) => {
        if (typeof part === 'string') return part;
        if (part && typeof part === 'object' && 'text' in part) {
          const text = (part as { text?: unknown }).text;
          return typeof text === 'string' ? text : '';
        }
        return '';
      })
      .join('');
  }

  /**
   * Reads a positive integer from the environment. Returns `fallback` when
   * the variable is unset, and logs a warning and returns `fallback` when it
   * does not parse to a positive integer.
   */
  private readPositiveIntEnv(key: string, fallback: number): number {
    const raw = process.env[key];
    if (!raw) return fallback;

    const parsed = Number.parseInt(raw, 10);
    if (!Number.isFinite(parsed) || parsed <= 0) {
      this.logger.warn(`Invalid ${key} value: "${raw}". Falling back to ${fallback}.`);
      return fallback;
    }
    return parsed;
  }

  /**
   * Reads a boolean from the environment: `1`, `true`, `yes` and `on`
   * (case-insensitive) are true, any other non-empty value is false, and an
   * unset variable yields `fallback`.
   */
  private readBooleanEnv(key: string, fallback: boolean): boolean {
    const raw = process.env[key];
    if (!raw) return fallback;
    return ['1', 'true', 'yes', 'on'].includes(raw.trim().toLowerCase());
  }

  /**
   * Creates the debug location for one run, named
   * `<prefix>-<YYMMDD>-<HHMMSS>` in local time, or returns null when debugging
   * is disabled. Runs started within the same second share a directory.
   */
  private createDebugSession(prefix: string): DebugSession | null {
    if (!this.debugEnabled) return null;

    const now = new Date();
    const twoDigits = (value: number): string => String(value).padStart(2, '0');

    const datePart =
      String(now.getFullYear()).slice(-2) + twoDigits(now.getMonth() + 1) + twoDigits(now.getDate());
    const timePart =
      twoDigits(now.getHours()) + twoDigits(now.getMinutes()) + twoDigits(now.getSeconds());

    const baseName = `${prefix}-${datePart}-${timePart}`;
    return { dirPath: path.join(this.debugDirectory, baseName), baseName };
  }

  /**
   * Writes a debug file into the session directory, creating the directory
   * when needed. Failures are logged as warnings and never thrown.
   */
  private async writeDebugFile(
    debugSession: DebugSession | null,
    filename: string,
    content: string,
  ): Promise<void> {
    if (!debugSession) return;

    try {
      await fs.promises.mkdir(debugSession.dirPath, { recursive: true });
      const filePath = path.join(debugSession.dirPath, filename);
      await fs.promises.writeFile(filePath, content, 'utf8');
    } catch (err) {
      this.logger.warn(`Failed to write flyer debug file ${filename}: ${this.toErrorMessage(err)}`);
    }
  }

  /**
   * True for ServiceUnavailableException (including this service's own
   * timeouts) and for errors whose message mentions a timeout, rate limit,
   * ECONNRESET or socket hang up.
   */
  private isRetryableError(err: unknown): boolean {
    if (err instanceof ServiceUnavailableException) return true;

    const message = this.toErrorMessage(err).toLowerCase();
    return ['timeout', 'timed out', 'rate limit', 'econnreset', 'socket hang up'].some((marker) =>
      message.includes(marker),
    );
  }

  /** Returns `err.message` for Error instances and `String(err)` otherwise. */
  private toErrorMessage(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }
}