diff --git a/backend/src/flyer-import/services/ai-flyer-parser.service.ts b/backend/src/flyer-import/services/ai-flyer-parser.service.ts index d3dcbaa3..1a888182 100644 --- a/backend/src/flyer-import/services/ai-flyer-parser.service.ts +++ b/backend/src/flyer-import/services/ai-flyer-parser.service.ts @@ -23,6 +23,9 @@ export class AiFlyerParserService { private readonly logger = new Logger(AiFlyerParserService.name); private readonly timeoutMs: number; private readonly maxRetries: number; + private readonly chunkSizeChars: number; + private readonly chunkOverlapChars: number; + private readonly maxChunks: number; private mistral: any; private apiKey: string; @@ -34,6 +37,9 @@ export class AiFlyerParserService { this.timeoutMs = this.readPositiveIntEnv('FLYER_AI_TIMEOUT_MS', 30_000); this.maxRetries = this.readPositiveIntEnv('FLYER_AI_RETRIES', 2); + this.chunkSizeChars = this.readPositiveIntEnv('FLYER_AI_CHUNK_SIZE_CHARS', 3_000); + this.chunkOverlapChars = this.readPositiveIntEnv('FLYER_AI_CHUNK_OVERLAP_CHARS', 300); + this.maxChunks = this.readPositiveIntEnv('FLYER_AI_MAX_CHUNKS', 8); } private async getClient(): Promise { @@ -56,56 +62,16 @@ export class AiFlyerParserService { try { const client = await this.getClient(); - const textWindows = [5000, 3000, 2000]; - const attempts = Math.max(1, Math.min(this.maxRetries + 1, textWindows.length)); - let lastError: unknown = null; + const chunks = this.splitIntoChunks(text); + this.logger.debug(`Parsing flyer text in ${chunks.length} chunk(s)`); - for (let i = 0; i < attempts; i++) { - const window = textWindows[i]; - const prompt = this.buildPrompt(text, window); - - try { - this.logger.debug( - `Sending request to Mistral Tiny (attempt ${i + 1}/${attempts}, timeout=${this.timeoutMs}ms, textWindow=${window})`, - ); - - const response = await this.withTimeout( - client.chat({ - model: 'mistral-tiny', - messages: [{ role: 'user', content: prompt }], - temperature: 0.1, - }), - this.timeoutMs, - 'Mistral-anrop 
timeout', - ); - - const content = response.choices?.[0]?.message?.content; - if (!content) { - throw new BadRequestException('Tomt svar från AI-modellen.'); - } - - this.logger.debug(`Mistral response length: ${content.length} chars`); - - const jsonString = this.sanitizeJsonResponse(content); - const items = JSON.parse(jsonString) as Array>; - - if (!Array.isArray(items)) { - throw new BadRequestException('AI returnerade inte en JSON-array.'); - } - - return items.map((item, idx) => this.normalizeAiItem(item, idx)); - } catch (attemptErr) { - lastError = attemptErr; - if (!this.isRetryableError(attemptErr) || i === attempts - 1) { - throw attemptErr; - } - this.logger.warn( - `Mistral attempt ${i + 1} failed (${this.toErrorMessage(attemptErr)}). Retrying with shorter text window.`, - ); - } + const allItems: AiFlyerParseResult[] = []; + for (let i = 0; i < chunks.length; i++) { + const chunkItems = await this.parseChunkWithRetry(client, chunks[i], i + 1, chunks.length); + allItems.push(...chunkItems); } - throw lastError instanceof Error ? 
lastError : new ServiceUnavailableException('AI-anrop misslyckades'); + return this.dedupeItems(allItems); } catch (err) { if (err instanceof SyntaxError) { this.logger.error(`JSON parse error: ${String(err)}`);
@@ -255,6 +221,164 @@ Exempel på utdata:
 .trim(); }
+  /**
+   * Splits raw flyer text into overlapping chunks so each one can be parsed separately.
+   *
+   * CRLF line endings are normalised to LF and the text is trimmed first. Text that fits
+   * within `chunkSizeChars` is returned as a single chunk. Longer text is cut into
+   * windows of at most `chunkSizeChars` characters; each window starts
+   * `chunkOverlapChars` characters before the previous one ended, so text cut at a
+   * boundary is repeated at the start of the next chunk.
+   *
+   * At most `maxChunks` chunks are produced. Text beyond that budget is left out, and a
+   * warning is logged so the truncation is visible.
+   *
+   * @param text Raw flyer text.
+   * @returns Trimmed, non-empty chunks in document order; `[]` for blank input.
+   */
+  private splitIntoChunks(text: string): string[] {
+    const normalized = text.replace(/\r\n/g, '\n').trim();
+    if (!normalized) return [];
+
+    if (normalized.length <= this.chunkSizeChars) {
+      return [normalized];
+    }
+
+    // An overlap at least as large as the window would pin `start` in place and emit the
+    // same chunk `maxChunks` times, so such a configuration falls back to no overlap.
+    let overlap = this.chunkOverlapChars;
+    if (overlap >= this.chunkSizeChars) {
+      this.logger.warn(
+        `Chunk overlap (${overlap}) is not smaller than chunk size (${this.chunkSizeChars}); splitting without overlap.`,
+      );
+      overlap = 0;
+    }
+
+    const chunks: string[] = [];
+    let start = 0;
+    let coveredUntil = 0;
+    while (start < normalized.length && chunks.length < this.maxChunks) {
+      const end = Math.min(start + this.chunkSizeChars, normalized.length);
+      const chunk = normalized.slice(start, end).trim();
+      if (chunk) chunks.push(chunk);
+      coveredUntil = end;
+      if (end >= normalized.length) break;
+      start = end - overlap;
+    }
+
+    if (coveredUntil < normalized.length) {
+      this.logger.warn(
+        `Flyer text exceeds ${this.maxChunks} chunk(s); the last ${normalized.length - coveredUntil} character(s) will not be parsed.`,
+      );
+    }
+
+    return chunks;
+  }
+
+  /**
+   * Sends one chunk to the `mistral-tiny` chat model and returns the normalised items.
+   *
+   * Up to `min(maxRetries + 1, 3)` attempts are made, each passing a smaller text window
+   * to `buildPrompt`. A failed attempt is repeated only when `isRetryableError` accepts
+   * the error; any other error, and the error of the final attempt, is rethrown.
+   *
+   * @param client Mistral client; only its `chat()` method is used.
+   * @param chunkText Text of the chunk to parse.
+   * @param chunkIndex Position of the chunk, used in log messages only.
+   * @param totalChunks Number of chunks, used in log messages only.
+   * @returns One normalised item per element of the JSON array returned by the model.
+   * @throws BadRequestException if the model returns no content or JSON that is not an array.
+   * @throws SyntaxError if the sanitised response is not valid JSON.
+   */
+  private async parseChunkWithRetry(
+    client: any,
+    chunkText: string,
+    chunkIndex: number,
+    totalChunks: number,
+  ): Promise<AiFlyerParseResult[]> {
+    // NOTE(review): `buildPrompt` appears to cut the text to `window` characters — confirm.
+    // The first window is never smaller than the chunk, so a chunk size configured above
+    // 3000 is not truncated before the first request.
+    const textWindows = [Math.max(3000, chunkText.length), 2200, 1600];
+    const attempts = Math.max(1, Math.min(this.maxRetries + 1, textWindows.length));
+
+    for (let i = 0; i < attempts; i++) {
+      const window = textWindows[i];
+      const prompt = this.buildPrompt(chunkText, window);
+
+      try {
+        this.logger.debug(
+          `Sending request to Mistral Tiny (chunk ${chunkIndex}/${totalChunks}, attempt ${i + 1}/${attempts}, timeout=${this.timeoutMs}ms, textWindow=${window})`,
+        );
+
+        const response = await this.withTimeout(
+          client.chat({
+            model: 'mistral-tiny',
+            messages: [{ role: 'user', content: prompt }],
+            temperature: 0.1,
+          }),
+          this.timeoutMs,
+          'Mistral-anrop timeout',
+        );
+
+        const content = response.choices?.[0]?.message?.content;
+        if (!content) {
+          throw new BadRequestException('Tomt svar från AI-modellen.');
+        }
+
+        this.logger.debug(`Mistral response length: ${content.length} chars`);
+
+        // Keep the parsed value as `unknown` until it is proven to be an array.
+        const parsed: unknown = JSON.parse(this.sanitizeJsonResponse(content));
+        if (!Array.isArray(parsed)) {
+          throw new BadRequestException('AI returnerade inte en JSON-array.');
+        }
+
+        return parsed.map((item, idx) => this.normalizeAiItem(item, idx));
+      } catch (attemptErr) {
+        if (!this.isRetryableError(attemptErr) || i === attempts - 1) {
+          throw attemptErr;
+        }
+        this.logger.warn(
+          `Mistral chunk ${chunkIndex}/${totalChunks} attempt ${i + 1} failed (${this.toErrorMessage(attemptErr)}). Retrying with shorter text window.`,
+        );
+      }
+    }
+
+    // Unreachable while `attempts` is a number (the final attempt returns or throws); kept
+    // so the method never resolves without a result.
+    throw new ServiceUnavailableException('AI-anrop misslyckades');
+  }
+
+  /**
+   * Removes items whose `normalizedName`, `price`, `priceUnit` and `offerText` all match
+   * an earlier item, keeping the first occurrence and the original order. A missing
+   * price, unit or offer text is treated as an empty string.
+   *
+   * @param items Parsed items, possibly containing repeats.
+   * @returns A new array without the repeats; `items` is not modified.
+   */
+  private dedupeItems(items: AiFlyerParseResult[]): AiFlyerParseResult[] {
+    const seen = new Set<string>();
+    const deduped: AiFlyerParseResult[] = [];
+
+    for (const item of items) {
+      // JSON-encode the fields instead of joining them with `|`, so a `|` inside a field
+      // cannot make two different items share a key.
+      const key = JSON.stringify([
+        item.normalizedName,
+        item.price ?? '',
+        item.priceUnit ?? '',
+        item.offerText ?? '',
+      ]);
+      if (seen.has(key)) continue;
+      seen.add(key);
+      deduped.push(item);
+    }
+
+    return deduped;
+  }
+
 private readPositiveIntEnv(key: string, fallback: number): number { const raw = process.env[key]; if (!raw) return fallback;