feat(import): enhance image URL handling and error reporting during recipe import

This commit is contained in:
Nils-Johan Gynther
2026-04-22 22:00:47 +02:00
parent 2edd6d003d
commit 28606d7abd
4 changed files with 126 additions and 37 deletions
+12 -7
View File
@@ -20,19 +20,24 @@ export async function downloadAndOptimizeImage(
sourceUrl: string, sourceUrl: string,
destDir: string, destDir: string,
): Promise<string> { ): Promise<string> {
// Protokollvalidering const raw = sourceUrl.trim();
if (!sourceUrl.startsWith('https://')) { const protocolNormalized = raw.startsWith('//') ? `https:${raw}` : raw;
throw new Error('Bild-URL måste använda https://');
}
// SSRF: blockera privata hostnames // SSRF: blockera privata hostnames
let hostname: string; let parsedUrl: URL;
try { try {
hostname = new URL(sourceUrl).hostname; parsedUrl = new URL(protocolNormalized);
} catch { } catch {
throw new Error('Ogiltig bild-URL'); throw new Error('Ogiltig bild-URL');
} }
// Protokollvalidering
if (parsedUrl.protocol !== 'https:') {
throw new Error('Bild-URL måste använda https://');
}
const hostname = parsedUrl.hostname;
if (BLOCKED_HOSTNAMES.test(hostname)) { if (BLOCKED_HOSTNAMES.test(hostname)) {
throw new Error('Bild-URL pekar på ett blockerat nätverk'); throw new Error('Bild-URL pekar på ett blockerat nätverk');
} }
@@ -42,7 +47,7 @@ export async function downloadAndOptimizeImage(
const timeout = setTimeout(() => controller.abort(), 10_000); const timeout = setTimeout(() => controller.abort(), 10_000);
let response: Response; let response: Response;
try { try {
response = await fetch(sourceUrl, { response = await fetch(parsedUrl.toString(), {
signal: controller.signal, signal: controller.signal,
headers: { 'User-Agent': 'Mozilla/5.0 (compatible; RecipeApp/1.0)' }, headers: { 'User-Agent': 'Mozilla/5.0 (compatible; RecipeApp/1.0)' },
}); });
@@ -19,24 +19,22 @@ export class GenericRecipeParser extends RecipeParser {
// Extrahera og:image för bildurl-fallback // Extrahera og:image för bildurl-fallback
const ogImage = this.extractOgImage(html); const ogImage = this.extractOgImage(html);
// Försöka extrahera JSON-LD recipe data // Försöka extrahera JSON-LD recipe data (flera script-taggar är vanligt)
const jsonLdMatch = html.match( const jsonLdRegex =
/<script[^>]*type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/i /<script[^>]*type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/gi;
); let jsonLdMatch: RegExpExecArray | null;
while ((jsonLdMatch = jsonLdRegex.exec(html)) !== null) {
const rawJson = jsonLdMatch[1]?.trim();
if (!rawJson) continue;
if (jsonLdMatch) {
try { try {
const jsonData = JSON.parse(jsonLdMatch[1]); const parsedJson = JSON.parse(rawJson);
const recipe = const recipe = this.findRecipeInJsonLd(parsedJson);
jsonData['@type'] === 'Recipe'
? jsonData
: jsonData['@graph']?.find((item: any) => item['@type'] === 'Recipe');
if (recipe) { if (recipe) {
this.logger.log('JSON-LD data found'); this.logger.log('JSON-LD data found');
return this.extractFromJsonLd(recipe, ogImage); return this.extractFromJsonLd(recipe, ogImage);
} }
} catch (err) { } catch {
this.logger.warn('JSON-LD parsing failed'); this.logger.warn('JSON-LD parsing failed');
} }
} }
@@ -45,6 +43,37 @@ export class GenericRecipeParser extends RecipeParser {
return this.parseFromHtml(html, ogImage); return this.parseFromHtml(html, ogImage);
} }
/**
 * Locate the first node typed `Recipe` in a parsed JSON-LD payload.
 *
 * Handles the shapes a JSON-LD script tag can contain:
 *  - a single node whose `@type` is `'Recipe'` or an array that includes it,
 *  - an array of nodes (searched in document order),
 *  - a node carrying an `@graph` array.
 *
 * Arrays and `@graph` containers are walked depth-first, so a recipe that
 * sits inside a nested array or a nested `@graph` is found as well.
 *
 * @param jsonData Any value produced by `JSON.parse`.
 * @returns The matching node, or `null` when no recipe node exists.
 */
private findRecipeInJsonLd(jsonData: any): any {
  // Primitives and null can never be JSON-LD nodes.
  if (jsonData === null || typeof jsonData !== 'object') return null;

  if (Array.isArray(jsonData)) {
    for (const item of jsonData) {
      const recipe = this.findRecipeInJsonLd(item);
      if (recipe) return recipe;
    }
    return null;
  }

  // `@type` may be a single string or a list of types.
  const type = jsonData['@type'];
  if (type === 'Recipe' || (Array.isArray(type) && type.includes('Recipe'))) {
    return jsonData;
  }

  // Reuse the recursive search instead of a flat `find`, so nested
  // arrays / nested `@graph` containers are not silently skipped.
  const graph = jsonData['@graph'];
  return Array.isArray(graph) ? this.findRecipeInJsonLd(graph) : null;
}
private extractOgImage(html: string): string | undefined { private extractOgImage(html: string): string | undefined {
const match = html.match(/<meta[^>]+property="og:image"[^>]+content="([^"]+)"/i) const match = html.match(/<meta[^>]+property="og:image"[^>]+content="([^"]+)"/i)
|| html.match(/<meta[^>]+content="([^"]+)"[^>]+property="og:image"/i); || html.match(/<meta[^>]+content="([^"]+)"[^>]+property="og:image"/i);
+40 -13
View File
@@ -17,21 +17,17 @@ export class IcaRecipeParser extends RecipeParser {
// Extrahera og:image för bildurl-fallback // Extrahera og:image för bildurl-fallback
const ogImage = this.extractOgImage(html); const ogImage = this.extractOgImage(html);
// Försöka extrahera JSON-LD recipe data (ICA använder detta) // Försöka extrahera JSON-LD recipe data (ICA använder ofta flera script-taggar)
const jsonLdMatch = html.match( const jsonLdRegex =
/<script[^>]*type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/i /<script[^>]*type="application\/ld\+json"[^>]*>([\s\S]*?)<\/script>/gi;
); let jsonLdMatch: RegExpExecArray | null;
while ((jsonLdMatch = jsonLdRegex.exec(html)) !== null) {
const rawJson = jsonLdMatch[1]?.trim();
if (!rawJson) continue;
if (jsonLdMatch) {
try { try {
const jsonData = JSON.parse(jsonLdMatch[1]); const parsedJson = JSON.parse(rawJson);
const recipe = this.findRecipeInJsonLd(parsedJson);
// Hitta recipe-objektet
const recipe =
jsonData['@type'] === 'Recipe'
? jsonData
: jsonData['@graph']?.find((item: any) => item['@type'] === 'Recipe');
if (recipe) { if (recipe) {
this.logger.log('JSON-LD recipe found'); this.logger.log('JSON-LD recipe found');
return this.extractFromJsonLd(recipe, ogImage); return this.extractFromJsonLd(recipe, ogImage);
@@ -46,6 +42,37 @@ export class IcaRecipeParser extends RecipeParser {
return this.parseFromHtml(html, ogImage); return this.parseFromHtml(html, ogImage);
} }
/**
 * Find the first node typed `Recipe` inside a parsed JSON-LD value.
 *
 * Supported shapes:
 *  - a node whose `@type` equals `'Recipe'` or is an array containing it,
 *  - an array of nodes, searched in order,
 *  - a node with an `@graph` array.
 *
 * Both arrays and `@graph` containers are searched recursively, which also
 * covers recipes placed in nested arrays or nested `@graph` containers.
 *
 * @param jsonData Any value produced by `JSON.parse`.
 * @returns The recipe node, or `null` if none is present.
 */
private findRecipeInJsonLd(jsonData: any): any {
  // Only objects (including arrays) can hold a recipe node.
  if (jsonData === null || typeof jsonData !== 'object') return null;

  if (Array.isArray(jsonData)) {
    for (const entry of jsonData) {
      const found = this.findRecipeInJsonLd(entry);
      if (found) return found;
    }
    return null;
  }

  // `@type` is either a string or an array of strings.
  const nodeType = jsonData['@type'];
  const isRecipe =
    nodeType === 'Recipe' ||
    (Array.isArray(nodeType) && nodeType.includes('Recipe'));
  if (isRecipe) return jsonData;

  // Delegate to the recursive array search rather than a flat `find`,
  // so deeper nesting inside `@graph` is handled too.
  const graph = jsonData['@graph'];
  if (Array.isArray(graph)) {
    return this.findRecipeInJsonLd(graph);
  }

  return null;
}
private extractOgImage(html: string): string | undefined { private extractOgImage(html: string): string | undefined {
const match = html.match(/<meta[^>]+property="og:image"[^>]+content="([^"]+)"/i) const match = html.match(/<meta[^>]+property="og:image"[^>]+content="([^"]+)"/i)
|| html.match(/<meta[^>]+content="([^"]+)"[^>]+property="og:image"/i); || html.match(/<meta[^>]+content="([^"]+)"[^>]+property="og:image"/i);
@@ -20,6 +20,7 @@ export interface QuickImportResult {
markdown: string; markdown: string;
source: 'ica' | 'pdf' | 'image' | 'other'; source: 'ica' | 'pdf' | 'image' | 'other';
imageUrl?: string; imageUrl?: string;
imageWarning?: string;
} }
type UploadKind = 'pdf' | 'image'; type UploadKind = 'pdf' | 'image';
@@ -317,12 +318,24 @@ export class QuickImportService {
// Ladda ner och optimera bild om parser hittade en // Ladda ner och optimera bild om parser hittade en
let imageUrl: string | undefined; let imageUrl: string | undefined;
let imageWarning: string | undefined;
if (recipe.imageUrl) { if (recipe.imageUrl) {
const normalizedImageUrl = this.normalizeImageUrl(recipe.imageUrl, url);
if (!normalizedImageUrl) {
imageWarning = 'Receptbild kunde inte tolkas till en giltig URL.';
this.logger.warn(
`Kunde inte normalisera bild-URL: "${recipe.imageUrl}" (källsida: ${url})`,
);
} else {
try { try {
imageUrl = await downloadAndOptimizeImage(recipe.imageUrl, IMAGE_DEST_DIR); imageUrl = await downloadAndOptimizeImage(normalizedImageUrl, IMAGE_DEST_DIR);
this.logger.log(`Bild optimerad och sparad: ${imageUrl}`); this.logger.log(`Bild optimerad och sparad: ${imageUrl}`);
} catch (imgErr) { } catch (imgErr) {
this.logger.warn(`Kunde inte ladda ner bild: ${imgErr}`); imageWarning = 'Receptbild kunde inte laddas ner.';
this.logger.warn(
`Kunde inte ladda ner bild: ${imgErr} (källa: ${normalizedImageUrl})`,
);
}
} }
} }
@@ -330,6 +343,7 @@ export class QuickImportService {
markdown, markdown,
source, source,
imageUrl, imageUrl,
imageWarning,
}; };
} catch (err) { } catch (err) {
const message = err instanceof Error ? err.message : 'Okänt fel vid scraping'; const message = err instanceof Error ? err.message : 'Okänt fel vid scraping';
@@ -340,6 +354,20 @@ export class QuickImportService {
} }
} }
/**
 * Turn an image reference found on a recipe page into an absolute URL string.
 *
 * Surrounding whitespace is removed, protocol-relative references (`//host/…`)
 * are pinned to `https:`, and everything else is resolved against the page URL.
 *
 * @param rawImageUrl Image reference as extracted by the parser.
 * @param pageUrl URL of the page the reference came from; used as the base.
 * @returns The absolute URL, or `null` when the input is blank or unparsable.
 */
private normalizeImageUrl(rawImageUrl: string, pageUrl: string): string | null {
  const candidate = rawImageUrl.trim();
  if (candidate.length === 0) {
    return null;
  }

  // Protocol-relative references get an explicit https scheme.
  let reference = candidate;
  if (candidate.startsWith('//')) {
    reference = `https:${candidate}`;
  }

  try {
    const resolved = new URL(reference, pageUrl);
    return resolved.href;
  } catch {
    // Neither absolute nor resolvable against the page URL.
    return null;
  }
}
/** /**
* Konvertera receptobjekt till Markdown-format * Konvertera receptobjekt till Markdown-format
*/ */