add: support for scraping more image links

2025-11-25 20:24:59 +01:00
parent ddf430bbb7
commit 6a927d4045
3 changed files with 106 additions and 15 deletions
--- a/src/lib/components/wishlist/WishlistItem.svelte
+++ b/src/lib/components/wishlist/WishlistItem.svelte
@@ -66,9 +66,10 @@
 			<div class="flex flex-col md:flex-row gap-4 flex-1">
 				{#if showImage && item.imageUrl}
 					<img
-						src={item.imageUrl}
+						src="/api/image-proxy?url={encodeURIComponent(item.imageUrl)}"
 						alt={item.title}
 						class="w-full md:w-32 h-32 object-cover rounded-lg"
+						onerror={(e) => { /* fall back to the direct URL only once; the flag stops an endless error loop if that URL fails too */ const img = e.currentTarget; if (!img.dataset.proxyFallback) { img.dataset.proxyFallback = 'true'; img.src = item.imageUrl; } }}
 					/>
 				{/if}

--- a/src/routes/api/image-proxy/+server.ts
+++ b/src/routes/api/image-proxy/+server.ts
@@ -0,0 +1,82 @@
+import type { RequestHandler } from './$types';
+
+/**
+ * Proxies a remote image: `GET /api/image-proxy?url=<absolute http(s) URL>`.
+ *
+ * The image is fetched server-side with browser-like request headers and its
+ * bytes are returned with a one-day public cache lifetime.
+ *
+ * Responses:
+ * - 400 if `url` is missing, malformed, or not http(s)
+ * - 415 if the upstream response is not served as `image/*`
+ * - the upstream status if the upstream request is not OK
+ * - 500 if the fetch itself throws
+ *
+ * NOTE(review): this does not block private/loopback hosts, so the endpoint can
+ * still reach internal services; confirm whether an allow/deny list is needed.
+ */
+export const GET: RequestHandler = async ({ url }) => {
+	const imageUrl = url.searchParams.get('url');
+
+	if (!imageUrl) {
+		return new Response('Image URL is required', { status: 400 });
+	}
+
+	// Parse up front so a malformed URL is reported as a client error, not a 500.
+	let target: URL;
+	try {
+		target = new URL(imageUrl);
+	} catch {
+		return new Response('Invalid image URL', { status: 400 });
+	}
+
+	// Only web URLs are proxied; any other scheme is rejected before fetching.
+	if (target.protocol !== 'http:' && target.protocol !== 'https:') {
+		return new Response('Only http(s) image URLs are supported', { status: 400 });
+	}
+
+	try {
+		// Fetch the image with proper headers to avoid blocking
+		const response = await fetch(target, {
+			headers: {
+				'User-Agent':
+					'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+				'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
+				'Accept-Language': 'en-US,en;q=0.9',
+				'Referer': target.origin,
+				'Sec-Fetch-Dest': 'image',
+				'Sec-Fetch-Mode': 'no-cors',
+				'Sec-Fetch-Site': 'cross-site'
+			}
+		});
+
+		if (!response.ok) {
+			return new Response('Failed to fetch image', { status: response.status });
+		}
+
+		const contentType = response.headers.get('content-type') || 'image/jpeg';
+
+		// Refuse non-image payloads so HTML/JS is never served from this origin.
+		if (!contentType.toLowerCase().startsWith('image/')) {
+			return new Response('Upstream resource is not an image', { status: 415 });
+		}
+
+		const imageBuffer = await response.arrayBuffer();
+
+		// Return the image with appropriate headers
+		return new Response(imageBuffer, {
+			headers: {
+				'Content-Type': contentType,
+				'Cache-Control': 'public, max-age=86400', // Cache for 1 day
+				'Access-Control-Allow-Origin': '*',
+				// Keep browsers from re-interpreting the body as another type
+				'X-Content-Type-Options': 'nosniff',
+				// Neutralise scripts in SVGs if the proxied URL is opened directly
+				'Content-Security-Policy': "default-src 'none'; style-src 'unsafe-inline'; sandbox"
+			}
+		});
+	} catch (error) {
+		console.error('Image proxy error:', error);
+		return new Response('Failed to proxy image', { status: 500 });
+	}
+};
--- a/src/routes/api/scrape-images/+server.ts
+++ b/src/routes/api/scrape-images/+server.ts
@@ -11,7 +11,11 @@ export const POST: RequestHandler = async ({ request }) => {
 	try {
 		const response = await fetch(url, {
 			headers: {
-				'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+				'User-Agent':
+					'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+				'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+				'Accept-Language': 'en-US,en;q=0.9',
+				'Referer': 'https://www.google.com/'
 			}
 		});

@@ -24,9 +28,13 @@ export const POST: RequestHandler = async ({ request }) => {
 		const origin = baseUrl.origin;

 		const imageUrls: string[] = [];
-		const imgRegex = /<img[^>]+src="([^">]+)"/g;
-		const ogImageRegex = /<meta[^>]+property="og:image"[^>]+content="([^">]+)"/g;
-		const twitterImageRegex = /<meta[^>]+name="twitter:image"[^>]+content="([^">]+)"/g;
+		// Match various image source patterns
+		const imgRegex = /<img[^>]+src=["']([^"'>]+)["']/gi;
+		const srcsetRegex = /<img[^>]+srcset=["']([^"'>]+)["']/gi;
+		const dataSrcRegex = /<img[^>]+data-src=["']([^"'>]+)["']/gi;
+		const ogImageRegex = /<meta[^>]+property=["']og:image["'][^>]+content=["']([^"'>]+)["']/gi;
+		const twitterImageRegex = /<meta[^>]+name=["']twitter:image["'][^>]+content=["']([^"'>]+)["']/gi;
+		const jsonLdRegex = /"image"\s*:\s*"([^"]+)"/gi;

 		function toAbsoluteUrl(imgUrl: string): string {
 			if (imgUrl.startsWith('http')) {
@@ -43,32 +51,70 @@ export const POST: RequestHandler = async ({ request }) => {

 		let match;

+		// Priority 1: OpenGraph and Twitter meta tags (usually the best product images)
 		while ((match = ogImageRegex.exec(html)) !== null) {
 			imageUrls.push(toAbsoluteUrl(match[1]));
 		}

 		while ((match = twitterImageRegex.exec(html)) !== null) {
-			imageUrls.push(toAbsoluteUrl(match[1]));
+			const url = toAbsoluteUrl(match[1]);
+			if (!imageUrls.includes(url)) {
+				imageUrls.push(url);
+			}
 		}

+		// Priority 2: JSON-LD structured data (common for e-commerce)
+		while ((match = jsonLdRegex.exec(html)) !== null) {
+			const url = toAbsoluteUrl(match[1]);
+			if (!imageUrls.includes(url)) {
+				imageUrls.push(url);
+			}
+		}
+
+		// Priority 3: data-src attributes (lazy loaded images)
+		while ((match = dataSrcRegex.exec(html)) !== null) {
+			const url = toAbsoluteUrl(match[1]);
+			if (!imageUrls.includes(url)) {
+				imageUrls.push(url);
+			}
+		}
+
+		// Priority 4: srcset attributes (responsive images)
+		while ((match = srcsetRegex.exec(html)) !== null) {
+			const srcsetValue = match[1];
+			// srcset can have multiple URLs with sizes, extract them
+			const srcsetUrls = srcsetValue.split(',').map((s) => {
+				const parts = s.trim().split(/\s+/);
+				return parts[0]; // Get the URL part before size descriptor
+			});
+			for (const srcsetUrl of srcsetUrls) {
+				const url = toAbsoluteUrl(srcsetUrl);
+				if (!imageUrls.includes(url)) {
+					imageUrls.push(url);
+				}
+			}
+		}
+
+		// Priority 5: Regular img src attributes
 		while ((match = imgRegex.exec(html)) !== null) {
-			const imgUrl = match[1];
-			const fullUrl = toAbsoluteUrl(imgUrl);
-			if (!imageUrls.includes(fullUrl)) {
-				imageUrls.push(fullUrl);
+			const url = toAbsoluteUrl(match[1]);
+			if (!imageUrls.includes(url)) {
+				imageUrls.push(url);
 			}
 		}

 		const filteredImages = imageUrls.filter(
 			(url) =>
-				!url.includes('logo') &&
-				!url.includes('icon') &&
-				!url.includes('sprite') &&
+				!url.toLowerCase().includes('logo') &&
+				!url.toLowerCase().includes('icon') &&
+				!url.toLowerCase().includes('sprite') &&
+				!url.toLowerCase().includes('favicon') &&
 				!url.endsWith('.svg') &&
-				url.length < 500
+				url.length < 1000 && // Increased limit for modern CDN URLs
+				!url.includes('data:image') // Skip data URLs
 		);

-		return json({ images: filteredImages.slice(0, 20) });
+		return json({ images: filteredImages.slice(0, 30) });
 	} catch (error) {
 		return json({ error: 'Failed to scrape images' }, { status: 500 });
 	}