From 6a927d40451af4d36df488e993d9b9574d87bce5 Mon Sep 17 00:00:00 2001 From: rasmusq Date: Tue, 25 Nov 2025 20:24:59 +0100 Subject: [PATCH] add: support for scraping more image links --- .../components/wishlist/WishlistItem.svelte | 3 +- src/routes/api/image-proxy/+server.ts | 44 +++++++++++ src/routes/api/scrape-images/+server.ts | 74 +++++++++++++++---- 3 files changed, 106 insertions(+), 15 deletions(-) create mode 100644 src/routes/api/image-proxy/+server.ts diff --git a/src/lib/components/wishlist/WishlistItem.svelte b/src/lib/components/wishlist/WishlistItem.svelte index 4944a4c..c2a2e44 100644 --- a/src/lib/components/wishlist/WishlistItem.svelte +++ b/src/lib/components/wishlist/WishlistItem.svelte @@ -66,9 +66,10 @@
{#if showImage && item.imageUrl} {item.title} e.currentTarget.src = item.imageUrl} /> {/if}
diff --git a/src/routes/api/image-proxy/+server.ts b/src/routes/api/image-proxy/+server.ts
new file mode 100644
index 0000000..6ce75f3
--- /dev/null
+++ b/src/routes/api/image-proxy/+server.ts
@@ -0,0 +1,176 @@
+import type { RequestHandler } from './$types';
+
+/** Upper bound, in milliseconds, on each upstream request. */
+const FETCH_TIMEOUT_MS = 10_000;
+
+/** Largest image body, in bytes, that the proxy will buffer and relay. */
+const MAX_IMAGE_BYTES = 10 * 1024 * 1024;
+
+/** Maximum number of upstream redirects followed for one request. */
+const MAX_REDIRECTS = 3;
+
+/**
+ * Reports whether a URL hostname is `localhost` or a loopback, private, or
+ * link-local IP literal.
+ *
+ * Only the literal hostname is inspected; DNS names that resolve to internal
+ * addresses are not detected here.
+ */
+function isInternalHost(hostname: string): boolean {
+	// URL.hostname keeps the brackets around IPv6 literals.
+	const host = hostname.toLowerCase().replace(/^\[|\]$/g, '');
+
+	if (host === 'localhost' || host.endsWith('.localhost')) {
+		return true;
+	}
+
+	const ipv4 = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(host);
+	if (ipv4) {
+		const first = Number(ipv4[1]);
+		const second = Number(ipv4[2]);
+		return (
+			first === 0 ||
+			first === 10 ||
+			first === 127 ||
+			(first === 169 && second === 254) ||
+			(first === 172 && second >= 16 && second <= 31) ||
+			(first === 192 && second === 168)
+		);
+	}
+
+	if (host.includes(':')) {
+		// Unspecified, loopback, IPv4-mapped, unique-local and link-local IPv6.
+		return host === '::' || host === '::1' || /^(::ffff:|f[cd]|fe[89ab])/.test(host);
+	}
+
+	return false;
+}
+
+/**
+ * Parses `raw` (optionally relative to `base`) and returns it only when it is
+ * an http(s) URL whose host is not an internal address; otherwise `null`.
+ */
+function parseTargetUrl(raw: string, base?: URL): URL | null {
+	let target: URL;
+	try {
+		target = new URL(raw, base);
+	} catch {
+		return null;
+	}
+
+	const isHttp = target.protocol === 'http:' || target.protocol === 'https:';
+	return isHttp && !isInternalHost(target.hostname) ? target : null;
+}
+
+/**
+ * Fetches `start` with browser-like headers, following at most
+ * `MAX_REDIRECTS` redirects by hand so that every hop is re-validated.
+ *
+ * @returns the final upstream response, or `null` when a redirect points at a
+ * disallowed URL or the redirect limit is exceeded.
+ */
+async function fetchImage(start: URL): Promise<Response | null> {
+	let target = start;
+
+	for (let hop = 0; hop <= MAX_REDIRECTS; hop++) {
+		// Fetch the image with proper headers to avoid blocking
+		const response = await fetch(target, {
+			headers: {
+				'User-Agent':
+					'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+				'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
+				'Accept-Language': 'en-US,en;q=0.9',
+				'Referer': target.origin,
+				'Sec-Fetch-Dest': 'image',
+				'Sec-Fetch-Mode': 'no-cors',
+				'Sec-Fetch-Site': 'cross-site'
+			},
+			redirect: 'manual',
+			signal: AbortSignal.timeout(FETCH_TIMEOUT_MS)
+		});
+
+		const isRedirect = response.status >= 300 && response.status < 400;
+		const location = isRedirect ? response.headers.get('location') : null;
+		if (location === null) {
+			return response;
+		}
+
+		// Release the redirect body before moving on to the next hop.
+		await response.body?.cancel();
+
+		const next = parseTargetUrl(location, target);
+		if (!next) {
+			return null;
+		}
+		target = next;
+	}
+
+	return null;
+}
+
+/**
+ * Proxies the remote image named by the `url` query parameter.
+ *
+ * Responds with 400 when the parameter is missing or is not a public http(s)
+ * URL, 415 when the upstream resource is not an image, 413 when it exceeds
+ * `MAX_IMAGE_BYTES`, 502 when a redirect cannot be followed safely, the
+ * upstream status when the upstream request fails, and 500 on unexpected
+ * errors such as a network failure or timeout.
+ */
+export const GET: RequestHandler = async ({ url }) => {
+	const imageUrl = url.searchParams.get('url');
+
+	if (!imageUrl) {
+		return new Response('Image URL is required', { status: 400 });
+	}
+
+	// The URL is caller-controlled: refuse anything that is not public http(s).
+	const target = parseTargetUrl(imageUrl);
+	if (!target) {
+		return new Response('Invalid image URL', { status: 400 });
+	}
+
+	try {
+		const response = await fetchImage(target);
+
+		if (!response) {
+			return new Response('Failed to fetch image', { status: 502 });
+		}
+
+		if (!response.ok) {
+			// Only relay genuine error codes; other non-2xx statuses (e.g. 304) become 502.
+			const status = response.status >= 400 ? response.status : 502;
+			return new Response('Failed to fetch image', { status });
+		}
+
+		const contentType = response.headers.get('content-type') || 'image/jpeg';
+		if (!contentType.toLowerCase().startsWith('image/')) {
+			return new Response('URL does not point to an image', { status: 415 });
+		}
+
+		const declaredLength = Number(response.headers.get('content-length') ?? 0);
+		if (declaredLength > MAX_IMAGE_BYTES) {
+			return new Response('Image is too large', { status: 413 });
+		}
+
+		const imageBuffer = await response.arrayBuffer();
+		if (imageBuffer.byteLength > MAX_IMAGE_BYTES) {
+			return new Response('Image is too large', { status: 413 });
+		}
+
+		// Return the image with appropriate headers
+		return new Response(imageBuffer, {
+			headers: {
+				'Content-Type': contentType,
+				'Cache-Control': 'public, max-age=86400', // Cache for 1 day
+				'Access-Control-Allow-Origin': '*',
+				// Stop browsers from sniffing or running active content (e.g. SVG scripts).
+				'X-Content-Type-Options': 'nosniff',
+				'Content-Security-Policy': "default-src 'none'; style-src 'unsafe-inline'; sandbox"
+			}
+		});
+	} catch (error) {
+		console.error('Image proxy error:', error);
+		return new Response('Failed to proxy image', { status: 500 });
+	}
+};
diff --git a/src/routes/api/scrape-images/+server.ts b/src/routes/api/scrape-images/+server.ts
index b075d70..a1400d3 100644
--- a/src/routes/api/scrape-images/+server.ts
+++ b/src/routes/api/scrape-images/+server.ts
@@ -11,7 +11,11 @@ export const POST: RequestHandler = async ({ request }) => {
 try { const response
= await fetch(url, { headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' + 'User-Agent': + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + 'Referer': 'https://www.google.com/' } }); @@ -24,9 +28,13 @@ export const POST: RequestHandler = async ({ request }) => { const origin = baseUrl.origin; const imageUrls: string[] = []; - const imgRegex = /]+src="([^">]+)"/g; - const ogImageRegex = /]+property="og:image"[^>]+content="([^">]+)"/g; - const twitterImageRegex = /]+name="twitter:image"[^>]+content="([^">]+)"/g; + // Match various image source patterns + const imgRegex = /]+src=["']([^"'>]+)["']/gi; + const srcsetRegex = /]+srcset=["']([^"'>]+)["']/gi; + const dataSrcRegex = /]+data-src=["']([^"'>]+)["']/gi; + const ogImageRegex = /]+property=["']og:image["'][^>]+content=["']([^"'>]+)["']/gi; + const twitterImageRegex = /]+name=["']twitter:image["'][^>]+content=["']([^"'>]+)["']/gi; + const jsonLdRegex = /"image"\s*:\s*"([^"]+)"/gi; function toAbsoluteUrl(imgUrl: string): string { if (imgUrl.startsWith('http')) { @@ -43,32 +51,70 @@ export const POST: RequestHandler = async ({ request }) => { let match; + // Priority 1: OpenGraph and Twitter meta tags (usually the best product images) while ((match = ogImageRegex.exec(html)) !== null) { imageUrls.push(toAbsoluteUrl(match[1])); } while ((match = twitterImageRegex.exec(html)) !== null) { - imageUrls.push(toAbsoluteUrl(match[1])); + const url = toAbsoluteUrl(match[1]); + if (!imageUrls.includes(url)) { + imageUrls.push(url); + } } + // Priority 2: JSON-LD structured data (common for e-commerce) + while ((match = jsonLdRegex.exec(html)) !== null) { + const url = toAbsoluteUrl(match[1]); + if (!imageUrls.includes(url)) { + imageUrls.push(url); + } + } + + // Priority 3: data-src 
attributes (lazy loaded images) + while ((match = dataSrcRegex.exec(html)) !== null) { + const url = toAbsoluteUrl(match[1]); + if (!imageUrls.includes(url)) { + imageUrls.push(url); + } + } + + // Priority 4: srcset attributes (responsive images) + while ((match = srcsetRegex.exec(html)) !== null) { + const srcsetValue = match[1]; + // srcset can have multiple URLs with sizes, extract them + const srcsetUrls = srcsetValue.split(',').map((s) => { + const parts = s.trim().split(/\s+/); + return parts[0]; // Get the URL part before size descriptor + }); + for (const srcsetUrl of srcsetUrls) { + const url = toAbsoluteUrl(srcsetUrl); + if (!imageUrls.includes(url)) { + imageUrls.push(url); + } + } + } + + // Priority 5: Regular img src attributes while ((match = imgRegex.exec(html)) !== null) { - const imgUrl = match[1]; - const fullUrl = toAbsoluteUrl(imgUrl); - if (!imageUrls.includes(fullUrl)) { - imageUrls.push(fullUrl); + const url = toAbsoluteUrl(match[1]); + if (!imageUrls.includes(url)) { + imageUrls.push(url); } } const filteredImages = imageUrls.filter( (url) => - !url.includes('logo') && - !url.includes('icon') && - !url.includes('sprite') && + !url.toLowerCase().includes('logo') && + !url.toLowerCase().includes('icon') && + !url.toLowerCase().includes('sprite') && + !url.toLowerCase().includes('favicon') && !url.endsWith('.svg') && - url.length < 500 + url.length < 1000 && // Increased limit for modern CDN URLs + !url.includes('data:image') // Skip data URLs ); - return json({ images: filteredImages.slice(0, 20) }); + return json({ images: filteredImages.slice(0, 30) }); } catch (error) { return json({ error: 'Failed to scrape images' }, { status: 500 }); }