update: better image scraping

2026-03-05 15:19:19 +01:00
parent 9f8ae9a972
commit 988c7ef6b5
1 changed files with 115 additions and 35 deletions
@@ -35,7 +35,9 @@ export const POST: RequestHandler = async ({ request }) => {
 					'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
 				'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
 				'Accept-Language': 'en-US,en;q=0.9',
-				'Referer': 'https://www.google.com/'
+				'Accept-Encoding': 'gzip, deflate, br',
+				'Cache-Control': 'no-cache',
+				'Pragma': 'no-cache'
 			}
 		});

@@ -48,13 +50,6 @@ export const POST: RequestHandler = async ({ request }) => {
 		const origin = baseUrl.origin;

 		const imageUrls: string[] = [];
-		// Match various image source patterns
-		const imgRegex = /<img[^>]+src=["']([^"'>]+)["']/gi;
-		const srcsetRegex = /<img[^>]+srcset=["']([^"'>]+)["']/gi;
-		const dataSrcRegex = /<img[^>]+data-src=["']([^"'>]+)["']/gi;
-		const ogImageRegex = /<meta[^>]+property=["']og:image["'][^>]+content=["']([^"'>]+)["']/gi;
-		const twitterImageRegex = /<meta[^>]+name=["']twitter:image["'][^>]+content=["']([^"'>]+)["']/gi;
-		const jsonLdRegex = /"image"\s*:\s*"([^"]+)"/gi;

 		function toAbsoluteUrl(imgUrl: string): string {
 			if (imgUrl.startsWith('http')) {
@@ -69,72 +64,157 @@ export const POST: RequestHandler = async ({ request }) => {
 			return `${origin}/${imgUrl}`;
 		}

+		/**
+		 * Heuristically decides whether an image URL could be product imagery.
+		 *
+		 * A URL is rejected when it contains a known UI-asset keyword (logo, icon,
+		 * nav, ...), points at an SVG file, embeds a `data:image` URI, or looks like
+		 * a loading placeholder. Keyword matching is a plain case-insensitive
+		 * substring test, so it can over-reject (e.g. "nav" also matches "canvas").
+		 *
+		 * @param url - Image URL to inspect.
+		 * @returns `true` when none of the rejection rules matched.
+		 */
+		function isLikelyProductImage(url: string): boolean {
+			const lower = url.toLowerCase();
+			const badPatterns = [
+				'logo', 'icon', 'sprite', 'favicon', 'banner', 'footer',
+				'header', 'background', 'pattern', 'placeholder', 'thumbnail-small',
+				'btn', 'button', 'menu', 'nav', 'navigation', 'social',
+				'instagram', 'facebook', 'twitter', 'linkedin', 'pinterest',
+				'loading', 'spinner', 'skeleton'
+			];
+			if (badPatterns.some((pattern) => lower.includes(pattern))) {
+				return false;
+			}
+			// Test the extension on the lower-cased URL without its query string or
+			// fragment, so "x.SVG" and "x.svg?v=2" are rejected just like "x.svg".
+			const [pathPart = lower] = lower.split(/[?#]/, 1);
+			if (pathPart.endsWith('.svg')) {
+				return false;
+			}
+			if (lower.includes('data:image')) {
+				return false;
+			}
+			return true;
+		}
+
 		let match;

-		// Priority 1: OpenGraph and Twitter meta tags (usually the best product images)
+		// Priority 1: OpenGraph and Twitter meta tags (main product image)
+		const ogImageRegex = /<meta[^>]+property=["']og:image["'][^>]+content=["']([^"'>]+)["']/gi;
+		const twitterImageRegex = /<meta[^>]+name=["']twitter:image["'][^>]+content=["']([^"'>]+)["']/gi;
+
 		while ((match = ogImageRegex.exec(html)) !== null) {
-			imageUrls.push(toAbsoluteUrl(match[1]));
+			const url = toAbsoluteUrl(match[1]);
+			if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
+				imageUrls.push(url);
+			}
 		}

 		while ((match = twitterImageRegex.exec(html)) !== null) {
 			const url = toAbsoluteUrl(match[1]);
-			if (!imageUrls.includes(url)) {
+			if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
 				imageUrls.push(url);
 			}
 		}

-		// Priority 2: JSON-LD structured data (common for e-commerce)
+		// Priority 2: Look for JSON-LD structured data (very common in modern e-commerce)
+		const jsonLdRegex = /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
 		while ((match = jsonLdRegex.exec(html)) !== null) {
-			const url = toAbsoluteUrl(match[1]);
-			if (!imageUrls.includes(url)) {
-				imageUrls.push(url);
+			try {
+				const jsonStr = match[1];
+				const jsonData = JSON.parse(jsonStr);
+
+				/**
+				 * Recursively walks a parsed JSON-LD value and adds the image URLs it finds to `results`.
+				 *
+				 * Values under `image`, `thumbnail` and `thumbnailUrl` are treated as images whether
+				 * they are a URL string, an object carrying `url`/`contentUrl` (e.g. an ImageObject),
+				 * or an array of either. A bare `url` property is only trusted on an image node:
+				 * on other nodes it is the item's own link, not an image.
+				 *
+				 * @param obj - Any value produced by `JSON.parse`.
+				 * @param results - Set that receives the absolute URLs accepted by `isLikelyProductImage`.
+				 * @param inImage - Whether `obj` was reached through an image-bearing property.
+				 */
+				function extractImages(obj: unknown, results: Set<string>, inImage = false): void {
+					if (typeof obj === 'string') {
+						// Strings only count as image URLs when an image property led here.
+						if (inImage && obj.trim() !== '') {
+							const url = toAbsoluteUrl(obj);
+							if (isLikelyProductImage(url)) {
+								results.add(url);
+							}
+						}
+						return;
+					}
+					if (obj === null || typeof obj !== 'object') return;
+					if (Array.isArray(obj)) {
+						for (const item of obj) {
+							extractImages(item, results, inImage);
+						}
+						return;
+					}
+					const node = obj as Record<string, unknown>;
+					const type = node['@type'];
+					const isImageNode =
+						inImage ||
+						type === 'ImageObject' ||
+						(Array.isArray(type) && type.includes('ImageObject'));
+					for (const [key, val] of Object.entries(node)) {
+						if (key === 'image' || key === 'thumbnail' || key === 'thumbnailUrl') {
+							extractImages(val, results, true);
+						} else if (isImageNode && (key === 'url' || key === 'contentUrl')) {
+							extractImages(val, results, true);
+						} else if (val !== null && typeof val === 'object') {
+							// Keep descending (e.g. through "@graph" or "offers") without image context.
+							extractImages(val, results);
+						}
+					}
+				}
+
+				const jsonImages = new Set<string>();
+				extractImages(jsonData, jsonImages);
+				jsonImages.forEach(img => {
+					if (!imageUrls.includes(img)) {
+						imageUrls.push(img);
+					}
+				});
+			} catch {
+				// JSON parsing failed, continue
 			}
 		}

-		// Priority 3: data-src attributes (lazy loaded images)
-		while ((match = dataSrcRegex.exec(html)) !== null) {
+		// Priority 3: Look for data-image attributes (common in React/SPA)
+		const dataImageRegex = /<[^>]+data-image=["']([^"'>]+)["']/gi;
+		while ((match = dataImageRegex.exec(html)) !== null) {
 			const url = toAbsoluteUrl(match[1]);
-			if (!imageUrls.includes(url)) {
+			if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
 				imageUrls.push(url);
 			}
 		}

 		// Priority 4: srcset attributes (responsive images)
+		const srcsetRegex = /<img[^>]+srcset=["']([^"'>]+)["']/gi;
 		while ((match = srcsetRegex.exec(html)) !== null) {
 			const srcsetValue = match[1];
-			// srcset can have multiple URLs with sizes, extract them
 			const srcsetUrls = srcsetValue.split(',').map((s) => {
 				const parts = s.trim().split(/\s+/);
-				return parts[0]; // Get the URL part before size descriptor
+				return parts[0];
 			});
 			for (const srcsetUrl of srcsetUrls) {
 				const url = toAbsoluteUrl(srcsetUrl);
-				if (!imageUrls.includes(url)) {
+				if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
 					imageUrls.push(url);
 				}
 			}
 		}

-		// Priority 5: Regular img src attributes
-		while ((match = imgRegex.exec(html)) !== null) {
+		// Priority 5: data-src attributes (lazy loaded)
+		const dataSrcRegex = /<img[^>]+data-src=["']([^"'>]+)["']/gi;
+		while ((match = dataSrcRegex.exec(html)) !== null) {
 			const url = toAbsoluteUrl(match[1]);
-			if (!imageUrls.includes(url)) {
+			if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
 				imageUrls.push(url);
 			}
 		}

-		const filteredImages = imageUrls.filter(
-			(url) =>
-				!url.toLowerCase().includes('logo') &&
-				!url.toLowerCase().includes('icon') &&
-				!url.toLowerCase().includes('sprite') &&
-				!url.toLowerCase().includes('favicon') &&
-				!url.endsWith('.svg') &&
-				url.length < 1000 && // Increased limit for modern CDN URLs
-				!url.includes('data:image') // Skip data URLs
-		);
+		// Priority 6: Regular img src attributes
+		const imgRegex = /<img[^>]+src=["']([^"'>]+)["']/gi;
+		while ((match = imgRegex.exec(html)) !== null) {
+			const url = toAbsoluteUrl(match[1]);
+			if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
+				imageUrls.push(url);
+			}
+		}

-		return json({ images: filteredImages.slice(0, 30) });
+		// Priority 7: Background images in style attributes (common in some e-commerce)
+		// The optional "-image" suffix is a non-capturing group so the URL is capture
+		// group 1; "+" instead of "*" keeps an empty url() from matching at all.
+		const bgImageRegex = /background(?:-image)?:\s*url\(["']?([^"')]+)["']?/gi;
+		while ((match = bgImageRegex.exec(html)) !== null) {
+			const rawUrl = match[1].trim();
+			// Drop blank values and inline data URIs before the value is made absolute.
+			if (rawUrl === '' || rawUrl.toLowerCase().startsWith('data:')) {
+				continue;
+			}
+			const url = toAbsoluteUrl(rawUrl);
+			if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
+				imageUrls.push(url);
+			}
+		}
+
+		// Final filtering: remove very long URLs and duplicates
+		const finalImages = [...new Set(imageUrls)].filter(url => {
+			return url.length < 2000 && isLikelyProductImage(url);
+		});
+
+		return json({ images: finalImages.slice(0, 30) });
 	} catch (error) {
 		return json({ error: 'Failed to scrape images' }, { status: 500 });
 	}