Files
recipe-app/backend/src/flyer-import/services/ai-flyer-parser.service.ts
T
Nils-Johan Gynther d9f992ca9a
Test Suite / backend-pr-quick (push) Has been skipped
Test Suite / quick-import-pr-quick (push) Has been skipped
Test Suite / backend-full (push) Successful in 4m21s
Test Suite / flutter-quality (push) Failing after 1m38s
feat(ai): enhance AI trace warnings and reason codes system
- Added structured warning system with `AdminAiWarning` type in backend and Flutter
- Implemented detailed reason descriptors with `FlyerReasonDescriptor` for parse and match operations
- Added `legacyWarnings` field to maintain backward compatibility
- Enhanced AI trace service to collect and format warnings with item-level context
- Updated flyer import services to include detailed reason descriptions in responses
- Added Swedish diacritic preservation for cheese variants (Prästost, Herrgårdsost, Grevéost)
- Implemented UTF-8 content validation for AI responses
- Added new reason code definitions in `reason-codes.ts`
- Updated Flutter UI to display structured warnings with severity indicators
- Added error report generation and copy functionality in admin panel
- Added comprehensive test coverage for new warning system and cheese normalization

BREAKING CHANGE: AI trace warnings are now structured objects instead of simple strings
2026-05-23 21:11:46 +02:00

635 lines
20 KiB
TypeScript

import {
BadRequestException,
Injectable,
Logger,
ServiceUnavailableException,
} from '@nestjs/common';
import * as fs from 'fs';
import * as path from 'path';
/**
 * One product offer extracted from flyer text by the AI parser.
 *
 * Instances are produced by the parser service in this file from the JSON
 * objects returned by the model; absent or unusable fields become `null`
 * (or an empty array / `false` for the bundle fields).
 */
export interface AiFlyerParseResult {
// Product name as returned by the model, trimmed; the parser substitutes a
// generated "Produkt N" placeholder when the model gives no usable name.
rawName: string;
// Lower-cased, punctuation-stripped form of `rawName`; part of the dedupe key.
normalizedName: string;
// Brand name, or null when the model did not provide one.
brand: string | null;
// Product category, or null when the model did not provide one.
category: string | null;
// Price as a number, or null when missing/unparseable.
price: number | null;
// Unit string; the parser in this file fills it from the model's `unit` field.
priceUnit: string | null;
// Comparison price as a number, or null when missing/unparseable.
comparisonPrice: number | null;
// Unit belonging to `comparisonPrice` (presumably kg/l/st, per the prompt examples).
comparisonUnit: string | null;
// Weight/size text for a single product, e.g. "1l".
weight: string | null;
// Total weight text for a whole bundle, e.g. "600g".
bundleWeight: string | null;
// True when the offer is a bundle/combo pack.
isBundle: boolean;
// Names of the products contained in a bundle; empty when not a bundle.
bundleItems: string[];
// Campaign/offer text; the parser joins array values from the model with spaces.
offerText: string | null;
// Confidence score; the parser in this file assigns a fixed 0.85.
confidence: number;
// Machine-readable provenance codes; the parser in this file emits ['ai_parsed'].
reasonCodes: string[];
}
/**
 * Diagnostic trace returned alongside the parsed items of one parse run.
 */
export interface AiFlyerParseTrace {
// Prompts of all chunks joined with a "-----" separator line, or null when none were sent.
prompt: string | null;
// Raw model responses of all chunks joined with the same separator, or null when there were none.
rawOutput: string | null;
// Number of text chunks the flyer text was split into.
chunkCount: number;
// Total number of extra attempts (beyond the first) summed over all chunks.
retryCount: number;
}
/**
 * Turns raw flyer text into structured product offers by sending it, chunk by
 * chunk, to a Mistral chat model and normalizing the JSON it returns.
 *
 * Configuration is read from environment variables in the constructor:
 * `MISTRAL_API_KEY` (required), `FLYER_AI_TIMEOUT_MS`, `FLYER_AI_RETRIES`,
 * `FLYER_AI_CHUNK_SIZE_CHARS`, `FLYER_AI_CHUNK_OVERLAP_CHARS`,
 * `FLYER_AI_MAX_CHUNKS`, `FLYER_AI_DEBUG` and `FLYER_AI_DEBUG_DIR`.
 */
@Injectable()
export class AiFlyerParserService {
  /** Model identifier sent with every chat request (also used in log output). */
  private static readonly MODEL = 'ministral-8b-2512';

  private readonly logger = new Logger(AiFlyerParserService.name);
  private readonly timeoutMs: number;
  private readonly maxRetries: number;
  private readonly chunkSizeChars: number;
  private readonly chunkOverlapChars: number;
  private readonly maxChunks: number;
  private readonly debugEnabled: boolean;
  private readonly debugDirectory: string;
  private mistral: any;
  private apiKey: string;

  /**
   * Reads all settings from the environment.
   *
   * @throws Error when `MISTRAL_API_KEY` is missing or empty.
   */
  constructor() {
    this.apiKey = process.env.MISTRAL_API_KEY ?? '';
    if (!this.apiKey) {
      throw new Error('MISTRAL_API_KEY environment variable not set');
    }
    this.timeoutMs = this.readPositiveIntEnv('FLYER_AI_TIMEOUT_MS', 30_000);
    this.maxRetries = this.readPositiveIntEnv('FLYER_AI_RETRIES', 2);
    this.chunkSizeChars = this.readPositiveIntEnv('FLYER_AI_CHUNK_SIZE_CHARS', 3_000);
    this.chunkOverlapChars = this.readPositiveIntEnv('FLYER_AI_CHUNK_OVERLAP_CHARS', 300);
    this.maxChunks = this.readPositiveIntEnv('FLYER_AI_MAX_CHUNKS', 8);
    this.debugEnabled = this.readBooleanEnv('FLYER_AI_DEBUG', false);
    this.debugDirectory = process.env.FLYER_AI_DEBUG_DIR?.trim() || path.join(process.cwd(), 'debug');
  }

  /** Lazily imports the Mistral SDK and caches a single client instance. */
  private async getClient(): Promise<any> {
    if (this.mistral) return this.mistral;
    const mistralModule = await import('@mistralai/mistralai');
    this.mistral = new mistralModule.default(this.apiKey);
    return this.mistral;
  }

  /**
   * Sends flyer text to the ministral-8b-2512 model for structured extraction.
   *
   * The text is split into chunks, each chunk is parsed (with retries), and the
   * combined items are de-duplicated.
   *
   * @param text Text from the flyer (from pdf-parse or OCR).
   * @returns The de-duplicated parsed products plus a trace of prompts/responses.
   * @throws BadRequestException when the text is empty or the model output is not valid JSON.
   * @throws ServiceUnavailableException on timeouts and any other failure.
   */
  async parseWithAI(text: string): Promise<{ items: AiFlyerParseResult[]; trace: AiFlyerParseTrace }> {
    if (!text || text.trim().length === 0) {
      throw new BadRequestException('Flyer-texten är tom. Kan inte fortsätta.');
    }
    const debugSession = this.createDebugSession('AI-flyerimporter');
    try {
      if (debugSession) {
        await this.writeDebugFile(
          debugSession,
          `${debugSession.baseName}-input.txt`,
          text,
        );
      }
      const client = await this.getClient();
      const chunks = this.splitIntoChunks(text);
      this.logger.debug(`Parsing flyer text in ${chunks.length} chunk(s)`);
      if (debugSession) {
        await this.writeDebugFile(
          debugSession,
          `${debugSession.baseName}-chunks.json`,
          JSON.stringify(chunks, null, 2),
        );
      }
      const allItems: AiFlyerParseResult[] = [];
      const prompts: string[] = [];
      const rawResponses: string[] = [];
      let retryCount = 0;
      // Chunks are processed sequentially so prompts/responses stay in order.
      for (let i = 0; i < chunks.length; i++) {
        const chunkResult = await this.parseChunkWithRetry(
          client,
          chunks[i],
          i + 1,
          chunks.length,
          debugSession,
        );
        allItems.push(...chunkResult.items);
        prompts.push(chunkResult.prompt);
        rawResponses.push(chunkResult.rawOutput);
        retryCount += Math.max(0, chunkResult.attemptsUsed - 1);
      }
      const deduped = this.dedupeItems(allItems);
      const trace: AiFlyerParseTrace = {
        prompt: prompts.length > 0 ? prompts.join('\n\n-----\n\n') : null,
        rawOutput: rawResponses.length > 0 ? rawResponses.join('\n\n-----\n\n') : null,
        chunkCount: chunks.length,
        retryCount,
      };
      if (debugSession) {
        await this.writeDebugFile(
          debugSession,
          `${debugSession.baseName}-result.json`,
          JSON.stringify(deduped, null, 2),
        );
      }
      return { items: deduped, trace };
    } catch (err) {
      if (debugSession) {
        await this.writeDebugFile(
          debugSession,
          `${debugSession.baseName}-error.txt`,
          this.toErrorMessage(err),
        );
      }
      // JSON.parse failures surface as SyntaxError: report them as bad model output.
      if (err instanceof SyntaxError) {
        this.logger.error(`JSON parse error: ${String(err)}`);
        throw new BadRequestException('AI returnerade ogiltigt JSON. Försök igen.');
      }
      if (err instanceof BadRequestException) {
        throw err;
      }
      if (err instanceof ServiceUnavailableException) {
        throw err;
      }
      this.logger.error(`AI parsing failed: ${String(err)}`);
      throw new ServiceUnavailableException('AI-tjänsten är inte tillgänglig just nu.');
    }
  }

  /**
   * Races `promise` against a timer and rejects with a
   * ServiceUnavailableException carrying `timeoutMessage` if the timer wins.
   * The timer is always cleared; the underlying operation is not cancelled.
   */
  private async withTimeout<T>(
    promise: Promise<T>,
    timeoutMs: number,
    timeoutMessage: string,
  ): Promise<T> {
    let timeoutHandle: ReturnType<typeof setTimeout> | null = null;
    const timeoutPromise = new Promise<never>((_, reject) => {
      timeoutHandle = setTimeout(() => {
        reject(new ServiceUnavailableException(timeoutMessage));
      }, timeoutMs);
    });
    try {
      return await Promise.race([promise, timeoutPromise]);
    } finally {
      if (timeoutHandle) clearTimeout(timeoutHandle);
    }
  }

  /**
   * Builds the prompt for Mistral.
   *
   * @param text Flyer text to embed at the end of the prompt.
   * @param maxTextLength Maximum number of characters of `text` to include; the rest is cut off.
   */
  private buildPrompt(text: string, maxTextLength: number): string {
    const truncatedText = text.length > maxTextLength ? text.substring(0, maxTextLength) : text;
    return `Du tolkar svenska matvaruflyers och ska returnera ENDAST en JSON-array.
Returnera objekt med exakt dessa fält:
- name: string (produkttitel)
- brand: string | null
- category: string | null
- isBundle: boolean
- weight: string | null (vikt/storlek for en enskild produkt)
- bundleWeight: string | null (totalvikt for hela kombipaketet)
- bundleItems: string[] (ingående produkter i paketet, tom array om ej bundle)
- price: number | null
- comparisonPrice: number | null
- unit: string | null (enhet for jamforpris, t.ex. kg/l/st)
- offer: string[]
Arbetssatt (viktigt):
Steg A) Identifiera om texten ar en gruppannons med flera varianter + gemensamma attribut.
Steg B) Returnera en post per faktisk produktvariant med arvd metadata.
Regler:
1) Vanlig produkt (ej bundle): isBundle=false, bundleWeight=null, bundleItems=[].
2) Kombipaket/bundle: isBundle=true, name ska vara paketets huvudnamn, bundleWeight totalvikt.
3) For bundle ska bundleItems innehalla de ingaende produkterna, t.ex. ["Chumlax 3x100g", "Alaska pollock 3x100g"].
4) price ar priset for hela forpackningen. comparisonPrice ar jamforpris som tal ("83:17" -> 83.17).
5) offer innehaller kampanjtext som "Max 10 kop/hushall".
6) Om en rubrik/lista innehaller flera kommaseparerade namn och efterfoljande rad/rader innehaller gemensam brand, vikt, pris eller kampanjvillkor: expandera till separata objekt (en per namn) och arv all gemensam metadata.
7) Tillämpa samma split-regel generellt for liknande tillbud (inte bara ost), nar listan tydligt representerar produktvarianter/smaker/sorter.
8) Splitta INTE om listan snarare ar ingredienser, avdelningar, eller otydlig marknadsforing utan tydlig produktvariant.
9) Specialregel ost: namn som PRAST/HERRGARD/GREVE ska normaliseras till Prästost/Herrgårdsost/Grevéost.
10) Om texten innehaller "ARLA KO" ska brand vara exakt "Arla Ko".
11) For ovan ostsorter ska category vara "Hardost".
12) Behåll svenska diakritiska tecken (ä, å, ö, é) i produktnamn. Returnera "Prästost", "Herrgårdsost", "Grevéost" - inte ASCII-versioner.
13) Returnera aldrig extra nycklar, text, markdown eller forklaringar utanfor JSON-arrayen.
Exempel bundle utdata:
[
{
"name": "Kaptenens Favoriter",
"brand": "Kapten Royal",
"category": "Fisk",
"isBundle": true,
"weight": null,
"bundleWeight": "600g",
"bundleItems": ["Chumlax 3x100g", "Alaska pollock 3x100g"],
"price": 49.90,
"comparisonPrice": 83.17,
"unit": "kg",
"offer": ["Max 10 kop/hushall"]
}
]
Exempel enkel produkt utdata:
[
{
"name": "ICA Basic Mjolk 1,5%",
"brand": "ICA Basic",
"category": "Mejeri",
"isBundle": false,
"weight": "1l",
"bundleWeight": null,
"bundleItems": [],
"price": 12.90,
"comparisonPrice": 12.90,
"unit": "l",
"offer": []
}
]
Exempel gruppannons med varianter (ska splittas):
Input-idé: "PRAST, HERRGARD, GREVE" + "ARLA KO" + gemensam vikt/pris.
Output-idé:
[
{
"name": "Prästost",
"brand": "Arla Ko",
"category": "Hardost",
"isBundle": false,
"weight": "667g",
"bundleWeight": null,
"bundleItems": [],
"price": null,
"comparisonPrice": 79.90,
"unit": "kg",
"offer": ["Max 3 forp/hushall"]
},
{
"name": "Herrgårdsost",
"brand": "Arla Ko",
"category": "Hardost",
"isBundle": false,
"weight": "667g",
"bundleWeight": null,
"bundleItems": [],
"price": null,
"comparisonPrice": 79.90,
"unit": "kg",
"offer": ["Max 3 forp/hushall"]
}
]
Exempel negativt fall (ska INTE splittas):
Input-idé: "Ingredienser: tomat, lok, vitlok".
Output-idé: en produktpost (ingen variant-expansion).
Text att tolka:
${truncatedText}`;
  }

  /**
   * Cleans the AI response so it can be parsed as JSON: strips Markdown code
   * fences and, when present, keeps only the span from the first `[` to the
   * last `]`.
   */
  private sanitizeJsonResponse(content: string): string {
    let cleaned = content.replace(/```json\n?/g, '').replace(/```\n?/g, '');
    cleaned = cleaned.trim();
    const jsonMatch = cleaned.match(/\[[\s\S]*\]/);
    if (jsonMatch) {
      cleaned = jsonMatch[0];
    }
    return cleaned;
  }

  /**
   * Normalizes and type-converts one AI item into our result format.
   *
   * @param item One object from the model's JSON array.
   * @param index Position of the item, used for the "Produkt N" fallback name.
   */
  private normalizeAiItem(item: Record<string, unknown>, index: number): AiFlyerParseResult {
    const toNumber = (val: unknown): number | null => {
      // Reject NaN/Infinity so they never leak into prices.
      if (typeof val === 'number') return Number.isFinite(val) ? val : null;
      if (typeof val === 'string') {
        // Swedish prices use ',' or ':' as the decimal separator ("79,90", "49:90").
        const parsed = Number.parseFloat(val.trim().replace(/[,:]/, '.'));
        return Number.isFinite(parsed) ? parsed : null;
      }
      return null;
    };
    const toString = (val: unknown): string | null => {
      if (typeof val === 'string') return val.trim() || null;
      return null;
    };
    const toArray = (val: unknown): string[] => {
      if (!Array.isArray(val)) return [];
      // Keep only primitive text/number entries; null or objects would otherwise
      // be stringified to "null" / "[object Object]".
      return val
        .filter((v): v is string | number => typeof v === 'string' || typeof v === 'number')
        .map((v) => String(v).trim())
        .filter((v) => v.length > 0);
    };
    const toBoolean = (val: unknown): boolean => {
      if (typeof val === 'string') {
        // Boolean("false") is true, so spell out the falsy spellings a model may emit.
        return !['', 'false', '0', 'no', 'nej', 'null'].includes(val.trim().toLowerCase());
      }
      return Boolean(val);
    };

    const rawName = toString(item.name) || `Produkt ${index + 1}`;
    const normalizedName = this.normalizeName(rawName);
    const unit = toString(item.unit);
    return {
      rawName,
      normalizedName,
      brand: toString(item.brand),
      category: toString(item.category),
      price: toNumber(item.price),
      priceUnit: unit,
      comparisonPrice: toNumber(item.comparisonPrice),
      // The prompt only defines `unit` (the comparison-price unit), so fall back
      // to it when the model sends no explicit `comparisonUnit`.
      comparisonUnit: toString(item.comparisonUnit) ?? unit,
      weight: toString(item.weight),
      bundleWeight: toString(item.bundleWeight),
      isBundle: toBoolean(item.isBundle),
      bundleItems: toArray(item.bundleItems),
      offerText: toString(item.offer) || (toArray(item.offer).join(' ') || null),
      confidence: 0.85,
      reasonCodes: ['ai_parsed'],
    };
  }

  /** Lower-cases a name, drops everything but a-z, å/ä/ö/é, digits and whitespace, and collapses spaces. */
  private normalizeName(name: string): string {
    return name
      .toLowerCase()
      .replace(/[^a-zåäöé0-9\s]/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Splits the text into overlapping windows of at most `chunkSizeChars`
   * characters, capped at `maxChunks` chunks.
   *
   * If the configured overlap is not smaller than the chunk size the window
   * could never advance, so chunking falls back to no overlap. Text beyond the
   * chunk cap is dropped, with a warning.
   */
  private splitIntoChunks(text: string): string[] {
    const normalized = text.replace(/\r\n/g, '\n').trim();
    if (!normalized) return [];
    if (normalized.length <= this.chunkSizeChars) {
      return [normalized];
    }

    let step = this.chunkSizeChars - this.chunkOverlapChars;
    if (step <= 0) {
      this.logger.warn(
        `FLYER_AI_CHUNK_OVERLAP_CHARS (${this.chunkOverlapChars}) must be smaller than FLYER_AI_CHUNK_SIZE_CHARS (${this.chunkSizeChars}). Chunking without overlap.`,
      );
      step = this.chunkSizeChars;
    }

    const chunks: string[] = [];
    let start = 0;
    while (start < normalized.length && chunks.length < this.maxChunks) {
      const end = Math.min(start + this.chunkSizeChars, normalized.length);
      const chunk = normalized.slice(start, end).trim();
      if (chunk) chunks.push(chunk);
      if (end >= normalized.length) return chunks;
      start += step;
    }
    // Only reachable when the chunk cap was hit with text still remaining.
    this.logger.warn(
      `Flyer text exceeds ${this.maxChunks} chunk(s); text after character ${start} was not sent to the AI.`,
    );
    return chunks;
  }

  /**
   * Parses one chunk, retrying retryable failures with progressively shorter
   * text windows.
   *
   * The first attempt always sends the whole chunk; later attempts truncate it
   * to 2200 and then 1600 characters. At most three attempts are made,
   * regardless of `FLYER_AI_RETRIES`.
   *
   * @param client Mistral client exposing `chat(...)`.
   * @param chunkText Text of this chunk.
   * @param chunkIndex 1-based chunk number (for logs and debug file names).
   * @param totalChunks Total number of chunks (for logs).
   * @param debugSession Debug session, or null when debugging is off.
   * @returns Normalized items, the prompt and raw output of the successful
   *          attempt, and how many attempts were used.
   * @throws The last attempt's error when it is not retryable or attempts are exhausted.
   */
  private async parseChunkWithRetry(
    client: any,
    chunkText: string,
    chunkIndex: number,
    totalChunks: number,
    debugSession: { dirPath: string; baseName: string } | null,
  ): Promise<{
    items: AiFlyerParseResult[];
    prompt: string;
    rawOutput: string;
    attemptsUsed: number;
  }> {
    // A configured chunk size above 3000 must not be silently cut on the first attempt.
    const textWindows = [Math.max(3000, chunkText.length), 2200, 1600];
    const attempts = Math.max(1, Math.min(this.maxRetries + 1, textWindows.length));
    let lastError: unknown = null;
    for (let i = 0; i < attempts; i++) {
      const window = textWindows[i];
      const prompt = this.buildPrompt(chunkText, window);
      try {
        this.logger.debug(
          `Sending request to ${AiFlyerParserService.MODEL} (chunk ${chunkIndex}/${totalChunks}, attempt ${i + 1}/${attempts}, timeout=${this.timeoutMs}ms, textWindow=${window})`,
        );
        if (debugSession) {
          await this.writeDebugFile(
            debugSession,
            `${debugSession.baseName}-chunk-${chunkIndex}-attempt-${i + 1}-prompt.txt`,
            prompt,
          );
        }
        const response = await this.withTimeout<any>(
          client.chat({
            model: AiFlyerParserService.MODEL,
            messages: [{ role: 'user', content: prompt }],
            temperature: 0.1,
          }),
          this.timeoutMs,
          'Mistral-anrop timeout',
        );
        const content = this.ensureUtf8Content(response.choices?.[0]?.message?.content);
        if (!content) {
          throw new BadRequestException('Tomt svar från AI-modellen.');
        }
        this.logger.debug(`Mistral response length: ${content.length} chars`);
        if (debugSession) {
          await this.writeDebugFile(
            debugSession,
            `${debugSession.baseName}-chunk-${chunkIndex}-attempt-${i + 1}-response.txt`,
            content,
          );
        }
        const jsonString = this.sanitizeJsonResponse(content);
        const parsed: unknown = JSON.parse(jsonString);
        if (!Array.isArray(parsed)) {
          throw new BadRequestException('AI returnerade inte en JSON-array.');
        }
        // Skip null/primitive/array entries instead of crashing on property access.
        const records = parsed.filter(
          (entry): entry is Record<string, unknown> =>
            entry !== null && typeof entry === 'object' && !Array.isArray(entry),
        );
        return {
          items: records.map((aiItem, idx) => this.normalizeAiItem(aiItem, idx)),
          prompt,
          rawOutput: content,
          attemptsUsed: i + 1,
        };
      } catch (attemptErr) {
        lastError = attemptErr;
        if (debugSession) {
          await this.writeDebugFile(
            debugSession,
            `${debugSession.baseName}-chunk-${chunkIndex}-attempt-${i + 1}-error.txt`,
            this.toErrorMessage(attemptErr),
          );
        }
        if (!this.isRetryableError(attemptErr) || i === attempts - 1) {
          throw attemptErr;
        }
        this.logger.warn(
          `Mistral chunk ${chunkIndex}/${totalChunks} attempt ${i + 1} failed (${this.toErrorMessage(attemptErr)}). Retrying with shorter text window.`,
        );
      }
    }
    // Defensive: the loop always returns or throws, but the compiler cannot prove it.
    throw lastError instanceof Error
      ? lastError
      : new ServiceUnavailableException('AI-anrop misslyckades');
  }

  /**
   * Removes duplicates (e.g. products seen twice because of chunk overlap).
   * Two items are duplicates when name, brand, price, units, comparison price,
   * offer signature and bundle flag all match; the first occurrence wins.
   */
  private dedupeItems(items: AiFlyerParseResult[]): AiFlyerParseResult[] {
    const seen = new Set<string>();
    const deduped: AiFlyerParseResult[] = [];
    for (const item of items) {
      const normalizedName = item.normalizedName.trim();
      const normalizedBrand = (item.brand ?? '').trim().toLowerCase();
      const normalizedPrice = item.price == null ? '' : Number(item.price).toFixed(2);
      const normalizedPriceUnit = (item.priceUnit ?? '').trim().toLowerCase();
      const normalizedComparisonPrice =
        item.comparisonPrice == null ? '' : Number(item.comparisonPrice).toFixed(2);
      const normalizedComparisonUnit = (item.comparisonUnit ?? '').trim().toLowerCase();
      const offerSignature = this.offerSignature(item.offerText);
      const key = [
        normalizedName,
        normalizedBrand,
        normalizedPrice,
        normalizedPriceUnit,
        normalizedComparisonPrice,
        normalizedComparisonUnit,
        offerSignature,
        item.isBundle ? '1' : '0',
      ].join('|');
      if (seen.has(key)) continue;
      seen.add(key);
      deduped.push(item);
    }
    return deduped;
  }

  /**
   * Reduces offer text to an accent-free, lower-case signature for the dedupe
   * key. Returns '' when the text is empty or contains none of the campaign
   * marker substrings.
   */
  private offerSignature(offerText: string | null | undefined): string {
    if (!offerText || offerText.trim().length === 0) return '';
    const normalized = offerText
      .toLowerCase()
      .normalize('NFD')
      .replace(/[\u0300-\u036f]/g, '')
      .replace(/[^a-z0-9\s]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();
    if (!normalized) return '';
    const hasCampaignMarkers =
      /(max|hogst|begransat|hushall|kund|kop|for|betala|ta)/.test(normalized)
      || /(\d+\s*for\s*\d+)/.test(normalized)
      || /(ta\s*\d+\s*betala\s*for\s*\d+)/.test(normalized);
    return hasCampaignMarkers ? normalized : '';
  }

  /**
   * Flattens the model's message content to a string and round-trips it
   * through a UTF-8 buffer. In debug mode, logs a hex preview when the text
   * contains the U+FFFD replacement character.
   */
  private ensureUtf8Content(content: unknown): string {
    const asString = this.flattenContent(content);
    if (!asString) return '';
    const utf8 = Buffer.from(asString, 'utf8').toString('utf8');
    if (this.debugEnabled && (asString.includes('\uFFFD') || utf8.includes('\uFFFD'))) {
      const hex = Buffer.from(asString, 'utf8').toString('hex').slice(0, 256);
      this.logger.debug(`Potential encoding issue in AI response (hex preview): ${hex}`);
    }
    return utf8;
  }

  /**
   * Converts message content to plain text: strings pass through, arrays are
   * concatenated from string parts and `{ text }` parts, null/undefined become
   * '', and anything else is stringified.
   */
  private flattenContent(content: unknown): string {
    if (typeof content === 'string') {
      return content;
    }
    if (Array.isArray(content)) {
      return content
        .map((part) => {
          if (typeof part === 'string') return part;
          if (part && typeof part === 'object' && 'text' in part) {
            const text = (part as { text?: unknown }).text;
            return typeof text === 'string' ? text : '';
          }
          return '';
        })
        .join('');
    }
    if (content == null) {
      return '';
    }
    return String(content);
  }

  /** Reads a positive integer from the environment; warns and uses `fallback` when invalid. */
  private readPositiveIntEnv(key: string, fallback: number): number {
    const raw = process.env[key];
    if (!raw) return fallback;
    const parsed = Number.parseInt(raw, 10);
    if (!Number.isFinite(parsed) || parsed <= 0) {
      this.logger.warn(`Invalid ${key} value: "${raw}". Falling back to ${fallback}.`);
      return fallback;
    }
    return parsed;
  }

  /** Reads a boolean flag from the environment; "1", "true", "yes" and "on" (any case) mean true. */
  private readBooleanEnv(key: string, fallback: boolean): boolean {
    const raw = process.env[key];
    if (!raw) return fallback;
    return ['1', 'true', 'yes', 'on'].includes(raw.trim().toLowerCase());
  }

  /**
   * Creates a debug session descriptor named `<prefix>-YYMMDD-HHMMSS` (local
   * time) under the debug directory, or returns null when debugging is off.
   * The directory itself is created lazily by `writeDebugFile`.
   */
  private createDebugSession(prefix: string): { dirPath: string; baseName: string } | null {
    if (!this.debugEnabled) return null;
    const now = new Date();
    const y = String(now.getFullYear()).slice(-2);
    const m = String(now.getMonth() + 1).padStart(2, '0');
    const d = String(now.getDate()).padStart(2, '0');
    const hh = String(now.getHours()).padStart(2, '0');
    const mm = String(now.getMinutes()).padStart(2, '0');
    const ss = String(now.getSeconds()).padStart(2, '0');
    const datePart = `${y}${m}${d}`;
    const timePart = `${hh}${mm}${ss}`;
    const baseName = `${prefix}-${datePart}-${timePart}`;
    const dirPath = path.join(this.debugDirectory, baseName);
    return { dirPath, baseName };
  }

  /**
   * Best-effort write of one debug file. Failures are logged as warnings and
   * never propagate, so debugging cannot break parsing.
   */
  private async writeDebugFile(
    debugSession: { dirPath: string; baseName: string } | null,
    filename: string,
    content: string,
  ): Promise<void> {
    if (!debugSession) return;
    try {
      await fs.promises.mkdir(debugSession.dirPath, { recursive: true });
      const filePath = path.join(debugSession.dirPath, filename);
      await fs.promises.writeFile(filePath, content, 'utf8');
    } catch (err) {
      this.logger.warn(`Failed to write flyer debug file ${filename}: ${this.toErrorMessage(err)}`);
    }
  }

  /**
   * Decides whether a failed attempt should be retried: any
   * ServiceUnavailableException (including our own timeout), or an error whose
   * message mentions a timeout, rate limit or connection reset.
   */
  private isRetryableError(err: unknown): boolean {
    if (err instanceof ServiceUnavailableException) return true;
    const message = this.toErrorMessage(err).toLowerCase();
    return (
      message.includes('timeout') ||
      message.includes('timed out') ||
      message.includes('rate limit') ||
      message.includes('econnreset') ||
      message.includes('socket hang up')
    );
  }

  /** Returns `err.message` for Error instances, otherwise the stringified value. */
  private toErrorMessage(err: unknown): string {
    if (err instanceof Error) return err.message;
    return String(err);
  }
}