diff --git a/backend/src/common/utils/download-image.ts b/backend/src/common/utils/download-image.ts index 38523446..a234e1e6 100644 --- a/backend/src/common/utils/download-image.ts +++ b/backend/src/common/utils/download-image.ts @@ -20,19 +20,24 @@ export async function downloadAndOptimizeImage( sourceUrl: string, destDir: string, ): Promise { - // Protokollvalidering - if (!sourceUrl.startsWith('https://')) { - throw new Error('Bild-URL måste använda https://'); - } + const raw = sourceUrl.trim(); + const protocolNormalized = raw.startsWith('//') ? `https:${raw}` : raw; // SSRF: blockera privata hostnames - let hostname: string; + let parsedUrl: URL; try { - hostname = new URL(sourceUrl).hostname; + parsedUrl = new URL(protocolNormalized); } catch { throw new Error('Ogiltig bild-URL'); } + // Protokollvalidering + if (parsedUrl.protocol !== 'https:') { + throw new Error('Bild-URL måste använda https://'); + } + + const hostname = parsedUrl.hostname; + if (BLOCKED_HOSTNAMES.test(hostname)) { throw new Error('Bild-URL pekar på ett blockerat nätverk'); } @@ -42,7 +47,7 @@ export async function downloadAndOptimizeImage( const timeout = setTimeout(() => controller.abort(), 10_000); let response: Response; try { - response = await fetch(sourceUrl, { + response = await fetch(parsedUrl.toString(), { signal: controller.signal, headers: { 'User-Agent': 'Mozilla/5.0 (compatible; RecipeApp/1.0)' }, }); diff --git a/backend/src/quick-import/parsers/generic.parser.ts b/backend/src/quick-import/parsers/generic.parser.ts index 779a71a0..d8227fc5 100644 --- a/backend/src/quick-import/parsers/generic.parser.ts +++ b/backend/src/quick-import/parsers/generic.parser.ts @@ -19,24 +19,22 @@ export class GenericRecipeParser extends RecipeParser { // Extrahera og:image för bildurl-fallback const ogImage = this.extractOgImage(html); - // Försöka extrahera JSON-LD recipe data - const jsonLdMatch = html.match( - /]*type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/i - ); + // Försöka extrahera JSON-LD recipe data (flera script-taggar är vanligt) + const jsonLdRegex = + /]*type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/gi; + let jsonLdMatch: RegExpExecArray | null; + while ((jsonLdMatch = jsonLdRegex.exec(html)) !== null) { + const rawJson = jsonLdMatch[1]?.trim(); + if (!rawJson) continue; - if (jsonLdMatch) { try { - const jsonData = JSON.parse(jsonLdMatch[1]); - const recipe = - jsonData['@type'] === 'Recipe' - ? jsonData - : jsonData['@graph']?.find((item: any) => item['@type'] === 'Recipe'); - + const parsedJson = JSON.parse(rawJson); + const recipe = this.findRecipeInJsonLd(parsedJson); if (recipe) { this.logger.log('JSON-LD data found'); return this.extractFromJsonLd(recipe, ogImage); } - } catch (err) { + } catch { this.logger.warn('JSON-LD parsing failed'); } } @@ -45,6 +43,37 @@ export class GenericRecipeParser extends RecipeParser { return this.parseFromHtml(html, ogImage); } + private findRecipeInJsonLd(jsonData: any): any { + if (!jsonData) return null; + + if (Array.isArray(jsonData)) { + for (const item of jsonData) { + const recipe = this.findRecipeInJsonLd(item); + if (recipe) return recipe; + } + return null; + } + + if (jsonData['@type'] === 'Recipe') { + return jsonData; + } + + if (Array.isArray(jsonData['@type']) && jsonData['@type'].includes('Recipe')) { + return jsonData; + } + + const graph = jsonData['@graph']; + if (Array.isArray(graph)) { + return graph.find( + (item: any) => + item?.['@type'] === 'Recipe' || + (Array.isArray(item?.['@type']) && item['@type'].includes('Recipe')), + ) ?? null; + } + + return null; + } + private extractOgImage(html: string): string | undefined { const match = html.match(/]+property="og:image"[^>]+content="([^"]+)"/i) || html.match(/]+content="([^"]+)"[^>]+property="og:image"/i); diff --git a/backend/src/quick-import/parsers/ica.parser.ts b/backend/src/quick-import/parsers/ica.parser.ts index 80440aec..90d871f8 100644 --- a/backend/src/quick-import/parsers/ica.parser.ts +++ b/backend/src/quick-import/parsers/ica.parser.ts @@ -17,21 +17,17 @@ export class IcaRecipeParser extends RecipeParser { // Extrahera og:image för bildurl-fallback const ogImage = this.extractOgImage(html); - // Försöka extrahera JSON-LD recipe data (ICA använder detta) - const jsonLdMatch = html.match( - /]*type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/i - ); + // Försöka extrahera JSON-LD recipe data (ICA använder ofta flera script-taggar) + const jsonLdRegex = + /]*type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/gi; + let jsonLdMatch: RegExpExecArray | null; + while ((jsonLdMatch = jsonLdRegex.exec(html)) !== null) { + const rawJson = jsonLdMatch[1]?.trim(); + if (!rawJson) continue; - if (jsonLdMatch) { try { - const jsonData = JSON.parse(jsonLdMatch[1]); - - // Hitta recipe-objektet - const recipe = - jsonData['@type'] === 'Recipe' - ? jsonData - : jsonData['@graph']?.find((item: any) => item['@type'] === 'Recipe'); - + const parsedJson = JSON.parse(rawJson); + const recipe = this.findRecipeInJsonLd(parsedJson); if (recipe) { this.logger.log('JSON-LD recipe found'); return this.extractFromJsonLd(recipe, ogImage); @@ -46,6 +42,37 @@ export class IcaRecipeParser extends RecipeParser { return this.parseFromHtml(html, ogImage); } + private findRecipeInJsonLd(jsonData: any): any { + if (!jsonData) return null; + + if (Array.isArray(jsonData)) { + for (const item of jsonData) { + const recipe = this.findRecipeInJsonLd(item); + if (recipe) return recipe; + } + return null; + } + + if (jsonData['@type'] === 'Recipe') { + return jsonData; + } + + if (Array.isArray(jsonData['@type']) && jsonData['@type'].includes('Recipe')) { + return jsonData; + } + + const graph = jsonData['@graph']; + if (Array.isArray(graph)) { + return graph.find( + (item: any) => + item?.['@type'] === 'Recipe' || + (Array.isArray(item?.['@type']) && item['@type'].includes('Recipe')), + ) ?? null; + } + + return null; + } + private extractOgImage(html: string): string | undefined { const match = html.match(/]+property="og:image"[^>]+content="([^"]+)"/i) || html.match(/]+content="([^"]+)"[^>]+property="og:image"/i); diff --git a/backend/src/quick-import/quick-import.service.ts b/backend/src/quick-import/quick-import.service.ts index e2d3809e..bc7a07b4 100644 --- a/backend/src/quick-import/quick-import.service.ts +++ b/backend/src/quick-import/quick-import.service.ts @@ -20,6 +20,7 @@ export interface QuickImportResult { markdown: string; source: 'ica' | 'pdf' | 'image' | 'other'; imageUrl?: string; + imageWarning?: string; } type UploadKind = 'pdf' | 'image'; @@ -317,12 +318,24 @@ export class QuickImportService { // Ladda ner och optimera bild om parser hittade en let imageUrl: string | undefined; + let imageWarning: string | undefined; if (recipe.imageUrl) { - try { - imageUrl = await downloadAndOptimizeImage(recipe.imageUrl, IMAGE_DEST_DIR); - this.logger.log(`Bild optimerad och sparad: ${imageUrl}`); - } catch (imgErr) { - this.logger.warn(`Kunde inte ladda ner bild: ${imgErr}`); + const normalizedImageUrl = this.normalizeImageUrl(recipe.imageUrl, url); + if (!normalizedImageUrl) { + imageWarning = 'Receptbild kunde inte tolkas till en giltig URL.'; + this.logger.warn( + `Kunde inte normalisera bild-URL: "${recipe.imageUrl}" (källsida: ${url})`, + ); + } else { + try { + imageUrl = await downloadAndOptimizeImage(normalizedImageUrl, IMAGE_DEST_DIR); + this.logger.log(`Bild optimerad och sparad: ${imageUrl}`); + } catch (imgErr) { + imageWarning = 'Receptbild kunde inte laddas ner.'; + this.logger.warn( + `Kunde inte ladda ner bild: ${imgErr} (källa: ${normalizedImageUrl})`, + ); + } } } @@ -330,6 +343,7 @@ export class QuickImportService { markdown, source, imageUrl, + imageWarning, }; } catch (err) { const message = err instanceof Error ? err.message : 'Okänt fel vid scraping'; @@ -340,6 +354,20 @@ export class QuickImportService { } } + private normalizeImageUrl(rawImageUrl: string, pageUrl: string): string | null { + const trimmed = rawImageUrl.trim(); + if (!trimmed) return null; + + const protocolNormalized = + trimmed.startsWith('//') ? `https:${trimmed}` : trimmed; + + try { + return new URL(protocolNormalized, pageUrl).toString(); + } catch { + return null; + } + } + /** * Konvertera receptobjekt till Markdown-format */