From 988c7ef6b5a22ef53f48962c3b5c51ac92d90abc Mon Sep 17 00:00:00 2001
From: Rasmus Krogh Udengaard
Date: Thu, 5 Mar 2026 15:19:19 +0100
Subject: [PATCH] update: better image scraping

---
Review notes (text between "---" and the first "diff --git" line is not part
of the commit message and is not read as patch content):

Changes made to added ("+") lines during review. No lines were added or
dropped, so the hunk headers and the diffstat below still hold.

* bgImageRegex: "(-image)?" was capture group 1, so match[1] was "-image" or
  undefined instead of the URL. An undefined value reaches
  imgUrl.startsWith() in toAbsoluteUrl(), throws, and is converted into the
  500 response by the outer catch. The group is now non-capturing and the
  url(...) payload must be non-empty.
* ogImageRegex, twitterImageRegex, jsonLdRegex, srcsetRegex, dataSrcRegex,
  imgRegex: the patterns began with a bare "]", which cannot match an HTML
  tag. Each now opens with the tag it is meant to scan (<meta, <script, <img,
  and <img or <source for srcset).
* extractImages / jsonImages: "Set" is now "Set<string>". A bare "Set" is not
  a valid parameter type, and "new Set()" infers Set<unknown>, whose elements
  cannot be pushed into imageUrls (string[]).
* isLikelyProductImage: the ".svg" test now uses the lower-cased URL, like
  the other checks in the function.

Removed ("-") lines are reproduced exactly as received.

Not changed here, worth a follow-up:

* extractImages treats every "url" key as an image URL, and it never descends
  into an object stored under "image", "thumbnail" or "url" (only strings and
  arrays of strings are read there).
* badPatterns are substring tests against the whole absolute URL, so "nav"
  also rejects "navy" and "icon" also rejects "silicon".

 src/routes/api/scrape-images/+server.ts | 150 ++++++++++++++++++------
 1 file changed, 115 insertions(+), 35 deletions(-)

diff --git a/src/routes/api/scrape-images/+server.ts b/src/routes/api/scrape-images/+server.ts
index a4515dd..bdd7105 100644
--- a/src/routes/api/scrape-images/+server.ts
+++ b/src/routes/api/scrape-images/+server.ts
@@ -35,7 +35,9 @@ export const POST: RequestHandler = async ({ request }) => {
 					'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
 				'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
 				'Accept-Language': 'en-US,en;q=0.9',
-				'Referer': 'https://www.google.com/'
+				'Accept-Encoding': 'gzip, deflate, br',
+				'Cache-Control': 'no-cache',
+				'Pragma': 'no-cache'
 			}
 		});
 
@@ -48,13 +50,6 @@ export const POST: RequestHandler = async ({ request }) => {
 		const origin = baseUrl.origin;
 		const imageUrls: string[] = [];
 
-		// Match various image source patterns
-		const imgRegex = /]+src=["']([^"'>]+)["']/gi;
-		const srcsetRegex = /]+srcset=["']([^"'>]+)["']/gi;
-		const dataSrcRegex = /]+data-src=["']([^"'>]+)["']/gi;
-		const ogImageRegex = /]+property=["']og:image["'][^>]+content=["']([^"'>]+)["']/gi;
-		const twitterImageRegex = /]+name=["']twitter:image["'][^>]+content=["']([^"'>]+)["']/gi;
-		const jsonLdRegex = /"image"\s*:\s*"([^"]+)"/gi;
 
 		function toAbsoluteUrl(imgUrl: string): string {
 			if (imgUrl.startsWith('http')) {
@@ -69,72 +64,157 @@ export const POST: RequestHandler = async ({ request }) => {
 			return `${origin}/${imgUrl}`;
 		}
 
+		function isLikelyProductImage(url: string): boolean {
+			const lower = url.toLowerCase();
+			const badPatterns = [
+				'logo', 'icon', 'sprite', 'favicon', 'banner', 'footer',
+				'header', 'background', 'pattern', 'placeholder', 'thumbnail-small',
+				'btn', 'button', 'menu', 'nav', 'navigation', 'social',
+				'instagram', 'facebook', 'twitter', 'linkedin', 'pinterest'
+			];
+			if (badPatterns.some(pattern => lower.includes(pattern))) {
+				return false;
+			}
+			if (lower.endsWith('.svg')) {
+				return false;
+			}
+			if (lower.includes('data:image')) {
+				return false;
+			}
+			if (lower.includes('loading') || lower.includes('spinner') || lower.includes('skeleton')) {
+				return false;
+			}
+			return true;
+		}
+
 		let match;
 
-		// Priority 1: OpenGraph and Twitter meta tags (usually the best product images)
+		// Priority 1: OpenGraph and Twitter meta tags (main product image)
+		const ogImageRegex = /<meta[^>]+property=["']og:image["'][^>]+content=["']([^"'>]+)["']/gi;
+		const twitterImageRegex = /<meta[^>]+name=["']twitter:image["'][^>]+content=["']([^"'>]+)["']/gi;
+
 		while ((match = ogImageRegex.exec(html)) !== null) {
-			imageUrls.push(toAbsoluteUrl(match[1]));
+			const url = toAbsoluteUrl(match[1]);
+			if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
+				imageUrls.push(url);
+			}
 		}
 
 		while ((match = twitterImageRegex.exec(html)) !== null) {
 			const url = toAbsoluteUrl(match[1]);
-			if (!imageUrls.includes(url)) {
+			if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
 				imageUrls.push(url);
 			}
 		}
 
-		// Priority 2: JSON-LD structured data (common for e-commerce)
+		// Priority 2: Look for JSON-LD structured data (very common in modern e-commerce)
+		const jsonLdRegex = /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
 		while ((match = jsonLdRegex.exec(html)) !== null) {
-			const url = toAbsoluteUrl(match[1]);
-			if (!imageUrls.includes(url)) {
-				imageUrls.push(url);
+			try {
+				const jsonStr = match[1];
+				const jsonData = JSON.parse(jsonStr);
+
+				function extractImages(obj: any, results: Set<string>) {
+					if (!obj || typeof obj !== 'object') return;
+					if (Array.isArray(obj)) {
+						obj.forEach((item: any) => extractImages(item, results));
+					} else {
+						for (const key in obj) {
+							if (key === 'image' || key === 'thumbnail' || key === 'url') {
+								const val = obj[key];
+								if (typeof val === 'string') {
+									const url = toAbsoluteUrl(val);
+									if (isLikelyProductImage(url)) {
+										results.add(url);
+									}
+								}
+								if (Array.isArray(val)) {
+									val.forEach((item: any) => {
+										if (typeof item === 'string') {
+											const url = toAbsoluteUrl(item);
+											if (isLikelyProductImage(url)) {
+												results.add(url);
+											}
+										}
+									});
+								}
+							} else if (typeof obj[key] === 'object') {
+								extractImages(obj[key], results);
+							}
+						}
+					}
+				}
+
+				const jsonImages = new Set<string>();
+				extractImages(jsonData, jsonImages);
+				jsonImages.forEach(img => {
+					if (!imageUrls.includes(img)) {
+						imageUrls.push(img);
+					}
+				});
+			} catch {
+				// JSON parsing failed, continue
 			}
 		}
 
-		// Priority 3: data-src attributes (lazy loaded images)
-		while ((match = dataSrcRegex.exec(html)) !== null) {
+		// Priority 3: Look for data-image attributes (common in React/SPA)
+		const dataImageRegex = /<[^>]+data-image=["']([^"'>]+)["']/gi;
+		while ((match = dataImageRegex.exec(html)) !== null) {
 			const url = toAbsoluteUrl(match[1]);
-			if (!imageUrls.includes(url)) {
+			if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
 				imageUrls.push(url);
 			}
 		}
 
 		// Priority 4: srcset attributes (responsive images)
+		const srcsetRegex = /<(?:img|source)[^>]+srcset=["']([^"'>]+)["']/gi;
 		while ((match = srcsetRegex.exec(html)) !== null) {
 			const srcsetValue = match[1];
-			// srcset can have multiple URLs with sizes, extract them
 			const srcsetUrls = srcsetValue.split(',').map((s) => {
 				const parts = s.trim().split(/\s+/);
-				return parts[0]; // Get the URL part before size descriptor
+				return parts[0];
 			});
 			for (const srcsetUrl of srcsetUrls) {
 				const url = toAbsoluteUrl(srcsetUrl);
-				if (!imageUrls.includes(url)) {
+				if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
 					imageUrls.push(url);
 				}
 			}
 		}
 
-		// Priority 5: Regular img src attributes
-		while ((match = imgRegex.exec(html)) !== null) {
+		// Priority 5: data-src attributes (lazy loaded)
+		const dataSrcRegex = /<img[^>]+data-src=["']([^"'>]+)["']/gi;
+		while ((match = dataSrcRegex.exec(html)) !== null) {
 			const url = toAbsoluteUrl(match[1]);
-			if (!imageUrls.includes(url)) {
+			if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
 				imageUrls.push(url);
 			}
 		}
 
-		const filteredImages = imageUrls.filter(
-			(url) =>
-				!url.toLowerCase().includes('logo') &&
-				!url.toLowerCase().includes('icon') &&
-				!url.toLowerCase().includes('sprite') &&
-				!url.toLowerCase().includes('favicon') &&
-				!url.endsWith('.svg') &&
-				url.length < 1000 && // Increased limit for modern CDN URLs
-				!url.includes('data:image') // Skip data URLs
-		);
+		// Priority 6: Regular img src attributes
+		const imgRegex = /<img[^>]+src=["']([^"'>]+)["']/gi;
+		while ((match = imgRegex.exec(html)) !== null) {
+			const url = toAbsoluteUrl(match[1]);
+			if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
+				imageUrls.push(url);
+			}
+		}
 
-		return json({ images: filteredImages.slice(0, 30) });
+		// Priority 7: Background images in style attributes (common in some e-commerce)
+		const bgImageRegex = /background(?:-image)?:\s*url\(["']?([^"')]+)["']?/gi; // "-image" is non-capturing, so match[1] is the URL
+		while ((match = bgImageRegex.exec(html)) !== null) {
+			const url = toAbsoluteUrl(match[1]);
+			if (isLikelyProductImage(url) && !imageUrls.includes(url) && !url.startsWith('data:')) {
+				imageUrls.push(url);
+			}
+		}
+
+		// Final filtering: remove very long URLs and duplicates
+		const finalImages = [...new Set(imageUrls)].filter(url => {
+			return url.length < 2000 && isLikelyProductImage(url);
+		});
+
+		return json({ images: finalImages.slice(0, 30) });
 	} catch (error) {
 		return json({ error: 'Failed to scrape images' }, { status: 500 });
 	}