// wishlist/src/routes/api/scrape-images/+server.ts

import { json } from '@sveltejs/kit';
import type { RequestHandler } from './$types';

/** Maximum number of image URLs returned to the client. */
const MAX_IMAGES = 30;

/** URLs at or above this length are dropped (generous enough for modern CDN URLs). */
const MAX_URL_LENGTH = 1000;

/** Only these schemes are fetched or returned; this also drops `data:` / `javascript:` URLs. */
const ALLOWED_PROTOCOLS: ReadonlySet<string> = new Set(['http:', 'https:']);

/** Substrings (checked against path + query) that mark an image as site chrome rather than content. */
const EXCLUDED_KEYWORDS = ['logo', 'icon', 'sprite', 'favicon'] as const;

/** Browser-like headers; some shops refuse requests that do not look like a browser. */
const REQUEST_HEADERS = {
	'User-Agent':
		'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
	'Accept-Language': 'en-US,en;q=0.9',
	'Referer': 'https://www.google.com/'
};

const META_TAG_PATTERN = /<meta\b[^>]*>/gi;
const IMG_TAG_PATTERN = /<img\b[^>]*>/gi;
const JSON_LD_IMAGE_PATTERN = /"image"\s*:\s*"([^"]+)"/gi;
// name = value, where value is double-quoted, single-quoted or unquoted.
const ATTRIBUTE_PATTERN = /([^\s"'<>/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/g;
const ENTITY_PATTERN = /&(#x[0-9a-f]+|#\d+|[a-z]+);/gi;
const SRCSET_SEPARATOR = /[\s,]/;
const WHITESPACE = /\s/;

const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
	['amp', '&'],
	['lt', '<'],
	['gt', '>'],
	['quot', '"'],
	['apos', "'"]
]);

/** Narrows an unknown JSON value to a plain key/value object. */
function isRecord(value: unknown): value is Record<string, unknown> {
	return typeof value === 'object' && value !== null;
}

/** Decodes numeric and the five basic named HTML entities; unknown entities are left untouched. */
function decodeEntities(value: string): string {
	return value.replace(ENTITY_PATTERN, (entity: string, body: string) => {
		if (body.startsWith('#')) {
			const isHex = body.charAt(1).toLowerCase() === 'x';
			const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
			// String.fromCodePoint throws above U+10FFFF, so keep such entities verbatim.
			return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
		}
		return NAMED_ENTITIES.get(body.toLowerCase()) ?? entity;
	});
}

/** Reads the attributes of a single HTML tag into a map keyed by lower-cased attribute name. */
function parseAttributes(tag: string): Map<string, string> {
	const attributes = new Map<string, string>();
	for (const match of tag.matchAll(ATTRIBUTE_PATTERN)) {
		const name = (match[1] ?? '').toLowerCase();
		// Keep the first occurrence when an attribute is repeated.
		if (!attributes.has(name)) {
			attributes.set(name, decodeEntities(match[2] ?? match[3] ?? match[4] ?? ''));
		}
	}
	return attributes;
}

/**
 * Extracts the URL of every candidate in a `srcset` value.
 *
 * A URL is a run of non-whitespace characters, so commas inside a URL
 * (common in CDN transform paths) do not split it; descriptors such as
 * `2x` / `800w` are skipped up to the next comma.
 */
function parseSrcset(srcset: string): string[] {
	const urls: string[] = [];
	let position = 0;
	while (position < srcset.length) {
		while (position < srcset.length && SRCSET_SEPARATOR.test(srcset.charAt(position))) {
			position++;
		}
		const start = position;
		while (position < srcset.length && !WHITESPACE.test(srcset.charAt(position))) {
			position++;
		}
		let candidate = srcset.slice(start, position);
		if (candidate.endsWith(',')) {
			// A trailing comma ends the candidate: there are no descriptors to skip.
			candidate = candidate.replace(/,+$/, '');
		} else {
			const nextComma = srcset.indexOf(',', position);
			position = nextComma === -1 ? srcset.length : nextComma + 1;
		}
		if (candidate !== '') {
			urls.push(candidate);
		}
	}
	return urls;
}

/** Undoes JSON string escaping (e.g. `https:\/\/…`) on a value captured from raw JSON-LD text. */
function unescapeJsonString(raw: string): string {
	try {
		const parsed: unknown = JSON.parse(`"${raw}"`);
		return typeof parsed === 'string' ? parsed : raw;
	} catch {
		// Not a valid JSON string body; still fix the most common escape.
		return raw.replace(/\\\//g, '/');
	}
}

/**
 * Collects raw (unresolved) image references from the page, best sources first:
 * OpenGraph, Twitter card, JSON-LD `"image"`, then `data-src`, `srcset` and `src`
 * of `<img>` tags.
 */
function collectImageCandidates(html: string): string[] {
	const openGraph: string[] = [];
	const twitter: string[] = [];
	for (const match of html.matchAll(META_TAG_PATTERN)) {
		const attributes = parseAttributes(match[0]);
		const content = attributes.get('content');
		if (content === undefined) continue;
		// Attribute order inside the tag does not matter; sites use both `property` and `name`.
		const key = (attributes.get('property') ?? attributes.get('name') ?? '').toLowerCase();
		if (key === 'og:image') {
			openGraph.push(content);
		} else if (key === 'twitter:image') {
			twitter.push(content);
		}
	}

	const jsonLd = Array.from(html.matchAll(JSON_LD_IMAGE_PATTERN), (match) =>
		unescapeJsonString(match[1] ?? '')
	);

	const lazy: string[] = [];
	const responsive: string[] = [];
	const plain: string[] = [];
	for (const match of html.matchAll(IMG_TAG_PATTERN)) {
		const attributes = parseAttributes(match[0]);
		const dataSrc = attributes.get('data-src');
		if (dataSrc !== undefined) lazy.push(dataSrc);
		const srcset = attributes.get('srcset');
		if (srcset !== undefined) responsive.push(...parseSrcset(srcset));
		const src = attributes.get('src');
		if (src !== undefined) plain.push(src);
	}

	return [...openGraph, ...twitter, ...jsonLd, ...lazy, ...responsive, ...plain];
}

/** Resolves a reference against the page URL; returns `undefined` for empty, malformed or non-http(s) URLs. */
function resolveImageUrl(candidate: string, base: URL): URL | undefined {
	const trimmed = candidate.trim();
	if (trimmed === '') return undefined;
	try {
		const resolved = new URL(trimmed, base);
		return ALLOWED_PROTOCOLS.has(resolved.protocol) ? resolved : undefined;
	} catch {
		return undefined;
	}
}

/** Heuristic filter that drops logos, icons, sprites, SVGs and over-long URLs. */
function isLikelyContentImage(image: URL): boolean {
	if (image.href.length >= MAX_URL_LENGTH) return false;
	const path = image.pathname.toLowerCase();
	if (path.endsWith('.svg')) return false;
	// The hostname is deliberately not checked: a host such as "silicon…" contains "icon".
	const haystack = path + image.search.toLowerCase();
	return !EXCLUDED_KEYWORDS.some((keyword) => haystack.includes(keyword));
}

/**
 * POST /api/scrape-images
 *
 * Expects a JSON body `{ "url": string }`, downloads that page and responds with
 * `{ images: string[] }`: up to 30 absolute, de-duplicated http(s) image URLs,
 * ordered from most to least likely to be the main content image.
 *
 * Error responses have the shape `{ error: string }`:
 * - 400 when the body is not JSON, `url` is missing or not an http(s) URL,
 *   or the remote server answers with a non-2xx status;
 * - 500 when fetching or parsing fails unexpectedly.
 */
export const POST: RequestHandler = async ({ request }) => {
	let payload: unknown;
	try {
		payload = await request.json();
	} catch {
		return json({ error: 'Invalid JSON body' }, { status: 400 });
	}

	const url = isRecord(payload) ? payload.url : undefined;
	if (typeof url !== 'string' || url.trim() === '') {
		return json({ error: 'URL is required' }, { status: 400 });
	}

	let pageUrl: URL;
	try {
		pageUrl = new URL(url);
	} catch {
		return json({ error: 'Invalid URL' }, { status: 400 });
	}
	if (!ALLOWED_PROTOCOLS.has(pageUrl.protocol)) {
		return json({ error: 'Invalid URL' }, { status: 400 });
	}

	try {
		// NOTE(review): this fetches a caller-supplied URL from the server (SSRF surface).
		// Only the scheme is restricted here; internal/private hosts and redirects to them
		// are not blocked. Consider an allow-list or address filtering if this is exposed
		// to untrusted users.
		const response = await fetch(pageUrl.href, { headers: REQUEST_HEADERS });

		if (!response.ok) {
			return json({ error: 'Failed to fetch URL' }, { status: 400 });
		}

		const html = await response.text();
		// Relative references must be resolved against the final URL after redirects.
		const baseUrl = response.url ? new URL(response.url) : pageUrl;

		const seen = new Set<string>();
		const images: string[] = [];
		for (const candidate of collectImageCandidates(html)) {
			const resolved = resolveImageUrl(candidate, baseUrl);
			if (resolved === undefined || seen.has(resolved.href)) continue;
			seen.add(resolved.href);
			if (isLikelyContentImage(resolved)) {
				images.push(resolved.href);
				if (images.length === MAX_IMAGES) break;
			}
		}

		return json({ images });
	} catch (error: unknown) {
		// Keep the client-facing message generic, but leave a trace for operators.
		console.error('scrape-images: failed to scrape', pageUrl.href, error);
		return json({ error: 'Failed to scrape images' }, { status: 500 });
	}
};