wishlist/src/routes/api/scrape-images/+server.ts

import { json } from '@sveltejs/kit';
import type { RequestHandler } from './$types';

/**
 * Decides whether a user-supplied string is a URL this endpoint is allowed to fetch.
 *
 * Only absolute `http:`/`https:` URLs are accepted, and hosts that point back at
 * the machine running the server (loopback names and addresses) are rejected.
 *
 * @param urlString - Raw URL text taken from the request body.
 * @returns `true` when the text parses as an http(s) URL with a non-loopback host.
 */
function isValidUrl(urlString: string): boolean {
	let parsed: URL;
	try {
		parsed = new URL(urlString);
	} catch {
		return false;
	}

	if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
		return false;
	}

	// `hostname` keeps the brackets around IPv6 literals ("[::1]") and may carry a
	// trailing root dot ("localhost."); strip both so the comparisons below work.
	const host = parsed.hostname
		.toLowerCase()
		.replace(/^\[|\]$/g, '')
		.replace(/\.$/, '');

	if (host === 'localhost' || host.endsWith('.localhost')) {
		return false;
	}

	// The URL parser canonicalises every IPv4 spelling (decimal, hex, short forms)
	// to dotted-quad, so one pattern covers the whole 127.0.0.0/8 loopback range.
	// 0.0.0.0 is the unspecified address, which connects to the local host.
	if (host === '0.0.0.0' || /^127(?:\.\d{1,3}){3}$/.test(host)) {
		return false;
	}

	// IPv6 loopback, the IPv6 unspecified address, and IPv4-mapped loopback
	// (::ffff:127.x.y.z is serialised by the parser as ::ffff:7fXX:YYZZ).
	if (host === '::1' || host === '::' || /^::ffff:7f[0-9a-f]{2}:[0-9a-f]{1,4}$/.test(host)) {
		return false;
	}

	return true;
}

/** Maximum number of image URLs returned to the client. */
const MAX_IMAGES = 30;

/** URLs of this length or longer are dropped from the result. */
const MAX_IMAGE_URL_LENGTH = 2000;

/**
 * Substrings that mark a URL as site chrome (logos, icons, placeholders, ...)
 * rather than a product photo. Matched case-insensitively against the whole URL.
 */
const NON_PRODUCT_PATTERNS = [
	'logo', 'icon', 'sprite', 'favicon', 'banner', 'footer',
	'header', 'background', 'pattern', 'placeholder', 'thumbnail-small',
	'btn', 'button', 'menu', 'nav', 'navigation', 'social',
	'instagram', 'facebook', 'twitter', 'linkedin', 'pinterest',
	'data:image', 'loading', 'spinner', 'skeleton'
];

/** JSON-LD property names whose values are treated as image URLs. */
const JSON_LD_IMAGE_KEYS = new Set(['image', 'thumbnail', 'url']);

// Patterns are only ever used through `matchAll`, which works on a copy, so
// sharing these global regexes at module level is safe.
const META_TAG = /<meta\b[^>]*>/gi;
const META_CONTENT = /\scontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i;
const JSON_LD_SCRIPT = /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
const DATA_IMAGE_ATTR = /<[^>]+data-image=["']([^"'>]+)["']/gi;
const IMG_SRCSET_ATTR = /<img[^>]+srcset=["']([^"'>]+)["']/gi;
const IMG_DATA_SRC_ATTR = /<img[^>]+data-src=["']([^"'>]+)["']/gi;
const IMG_SRC_ATTR = /<img[^>]+src=["']([^"'>]+)["']/gi;
// The "-image" suffix is a non-capturing group so that group 1 is the URL itself.
const CSS_BACKGROUND_URL = /background(?:-image)?:\s*url\(["']?([^"')]*)["']?/gi;

/** Heuristic filter: returns false for URLs that look like site chrome or SVG assets. */
function isLikelyProductImage(url: string): boolean {
	const lower = url.toLowerCase();
	if (NON_PRODUCT_PATTERNS.some((pattern) => lower.includes(pattern))) {
		return false;
	}
	// Ignore a trailing query string or fragment when checking the extension.
	return !/\.svg(?:[?#]|$)/.test(lower);
}

/**
 * Resolves a possibly relative image reference against the page URL.
 * Returns null for empty, unparsable, or non-http(s) references (e.g. `data:` URIs).
 */
function resolveImageUrl(reference: string, baseUrl: string): string | null {
	const trimmed = reference.trim();
	if (trimmed === '') {
		return null;
	}
	try {
		const resolved = new URL(trimmed, baseUrl);
		const isHttp = resolved.protocol === 'http:' || resolved.protocol === 'https:';
		return isHttp ? resolved.href : null;
	} catch {
		return null;
	}
}

/** Returns the `content` of every `<meta>` tag whose `attr` equals `value`, in document order. */
function metaContents(html: string, attr: 'property' | 'name', value: string): string[] {
	// Matching per tag (instead of one regex over the document) accepts the
	// identifying attribute before or after `content`.
	const hasAttr = new RegExp(`\\s${attr}\\s*=\\s*["']${value}["']`, 'i');
	const contents: string[] = [];
	for (const tagMatch of html.matchAll(META_TAG)) {
		const tag = tagMatch[0];
		if (!hasAttr.test(tag)) {
			continue;
		}
		const contentMatch = META_CONTENT.exec(tag);
		const content = contentMatch?.[1] ?? contentMatch?.[2];
		if (content) {
			contents.push(content);
		}
	}
	return contents;
}

/**
 * Walks parsed JSON-LD and reports every string found under an image-like key.
 * Objects nested under such a key (e.g. an ImageObject under `image`) are walked too.
 * Note: `url` is matched wherever it appears, so page URLs can be reported as well.
 */
function collectJsonLdImages(node: unknown, onImage: (url: string) => void): void {
	if (node === null || typeof node !== 'object') {
		return;
	}
	if (Array.isArray(node)) {
		for (const item of node) {
			collectJsonLdImages(item, onImage);
		}
		return;
	}
	for (const [key, value] of Object.entries(node)) {
		if (!JSON_LD_IMAGE_KEYS.has(key)) {
			collectJsonLdImages(value, onImage);
			continue;
		}
		const candidates: unknown[] = Array.isArray(value) ? value : [value];
		for (const candidate of candidates) {
			if (typeof candidate === 'string') {
				onImage(candidate);
			} else {
				collectJsonLdImages(candidate, onImage);
			}
		}
	}
}

/**
 * Extracts candidate product-image URLs from an HTML document.
 *
 * Sources are consulted in priority order, so the most likely main image comes
 * first: OpenGraph/Twitter meta tags, JSON-LD, `data-image`, `srcset`,
 * `data-src`, `src`, and finally CSS `background` URLs.
 *
 * @param html - Raw page markup.
 * @param baseUrl - URL the markup was fetched from; relative references resolve against it.
 * @returns De-duplicated absolute http(s) URLs, at most `MAX_IMAGES` of them.
 */
function extractImageUrls(html: string, baseUrl: string): string[] {
	const found = new Set<string>();

	const consider = (candidate: string | undefined): void => {
		if (!candidate) {
			return;
		}
		const absolute = resolveImageUrl(candidate, baseUrl);
		if (
			absolute !== null &&
			absolute.length < MAX_IMAGE_URL_LENGTH &&
			isLikelyProductImage(absolute)
		) {
			found.add(absolute);
		}
	};
	// Attribute values are HTML-escaped; "&amp;" in query strings is the common case.
	const considerAttribute = (value: string | undefined): void =>
		consider(value?.replace(/&amp;/gi, '&'));
	const firstGroups = (pattern: RegExp): string[] =>
		Array.from(html.matchAll(pattern), (m) => m[1] ?? '');

	// Priority 1: OpenGraph and Twitter meta tags (main product image)
	metaContents(html, 'property', 'og:image').forEach(considerAttribute);
	metaContents(html, 'name', 'twitter:image').forEach(considerAttribute);

	// Priority 2: JSON-LD structured data (very common in modern e-commerce)
	for (const jsonText of firstGroups(JSON_LD_SCRIPT)) {
		try {
			collectJsonLdImages(JSON.parse(jsonText), consider);
		} catch {
			// Best effort: a malformed JSON-LD block must not fail the whole scrape.
		}
	}

	// Priority 3: data-image attributes (common in React/SPA)
	firstGroups(DATA_IMAGE_ATTR).forEach(considerAttribute);

	// Priority 4: srcset attributes (responsive images); each candidate is "url [descriptor]"
	for (const srcset of firstGroups(IMG_SRCSET_ATTR)) {
		for (const candidate of srcset.split(',')) {
			considerAttribute(candidate.trim().split(/\s+/)[0]);
		}
	}

	// Priority 5: data-src attributes (lazy loaded)
	firstGroups(IMG_DATA_SRC_ATTR).forEach(considerAttribute);

	// Priority 6: regular img src attributes
	firstGroups(IMG_SRC_ATTR).forEach(considerAttribute);

	// Priority 7: background images in inline styles
	firstGroups(CSS_BACKGROUND_URL).forEach(considerAttribute);

	return [...found].slice(0, MAX_IMAGES);
}

/**
 * POST handler: fetches the page at the `url` given in the JSON request body
 * and responds with `{ images: string[] }`, a prioritised list of image URLs.
 *
 * Responds with `{ error }` and status 400 when the body is not JSON, the URL
 * is missing or rejected by `isValidUrl`, or the remote page does not return a
 * success status; status 500 when fetching or scraping throws.
 */
export const POST: RequestHandler = async ({ request }) => {
	let body: unknown;
	try {
		body = await request.json();
	} catch {
		return json({ error: 'Invalid JSON body' }, { status: 400 });
	}

	const url =
		typeof body === 'object' && body !== null ? (body as { url?: unknown }).url : undefined;

	if (!url) {
		return json({ error: 'URL is required' }, { status: 400 });
	}

	if (typeof url !== 'string' || !isValidUrl(url)) {
		return json({ error: 'Invalid URL' }, { status: 400 });
	}

	try {
		// Browser-like headers: some shops serve reduced markup to unknown clients.
		const response = await fetch(url, {
			headers: {
				'User-Agent':
					'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
				'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
				'Accept-Language': 'en-US,en;q=0.9',
				'Accept-Encoding': 'gzip, deflate, br',
				'Cache-Control': 'no-cache',
				'Pragma': 'no-cache'
			}
		});

		if (!response.ok) {
			return json({ error: 'Failed to fetch URL' }, { status: 400 });
		}

		// After redirects the document lives at `response.url`; relative image
		// paths must resolve against that, not against the URL that was requested.
		const pageUrl = response.url || url;

		// NOTE(security): redirects are followed, so re-validate the final URL before
		// returning anything derived from it. This is only partial SSRF protection:
		// the request to the redirect target has already been made at this point.
		if (!isValidUrl(pageUrl)) {
			return json({ error: 'Invalid URL' }, { status: 400 });
		}

		const html = await response.text();

		return json({ images: extractImageUrls(html, pageUrl) });
	} catch (error) {
		console.error('scrape-images: failed to scrape', url, error);
		return json({ error: 'Failed to scrape images' }, { status: 500 });
	}
};