import { json } from '@sveltejs/kit';
import type { RequestHandler } from './$types';

/** Maximum number of image URLs returned to the client. */
const MAX_IMAGES = 30;

/** Image URLs of this length or longer are discarded. */
const MAX_URL_LENGTH = 1000;

/** Upper bound, in milliseconds, on the outbound page fetch (headers and body). */
const FETCH_TIMEOUT_MS = 10000;

/** Substrings that mark a URL as site chrome rather than content ('icon' also covers 'favicon'). */
const EXCLUDED_KEYWORDS = ['logo', 'icon', 'sprite'];

/** Browser-like request headers sent with the page fetch. */
const BROWSER_HEADERS = {
	'User-Agent':
		'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
	Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
	'Accept-Language': 'en-US,en;q=0.9',
	Referer: 'https://www.google.com/'
};

// Image source patterns. Each captures the attribute/property value in group 1.
// They are only ever used through String.prototype.matchAll, which works on a
// copy of the regex, so sharing these global-flag instances across requests is safe.
const OG_IMAGE_REGEX = /<meta\b[^>]+property=["']og:image["'][^>]+content=["']([^"'>]+)["']/gi;
const TWITTER_IMAGE_REGEX = /<meta\b[^>]+name=["']twitter:image["'][^>]+content=["']([^"'>]+)["']/gi;
const JSON_LD_IMAGE_REGEX = /"image"\s*:\s*"([^"]+)"/gi;
// `\s` before the attribute name keeps `src` from matching inside `data-src`, etc.
const DATA_SRC_REGEX = /<img\b[^>]*?\sdata-src=["']([^"'>]+)["']/gi;
const SRCSET_REGEX = /<(?:img|source)\b[^>]*?\ssrcset=["']([^"'>]+)["']/gi;
const IMG_SRC_REGEX = /<img\b[^>]*?\ssrc=["']([^"'>]+)["']/gi;

/** Returns true when the URL uses the http or https scheme. */
function isHttpUrl(candidate: URL): boolean {
	return candidate.protocol === 'http:' || candidate.protocol === 'https:';
}

/** Reads the `url` field from a parsed JSON body, or undefined when the body is not an object. */
function readUrlField(body: unknown): unknown {
	if (typeof body !== 'object' || body === null) {
		return undefined;
	}
	return (body as Record<string, unknown>).url;
}

/**
 * Resolves a raw attribute value against the page URL.
 *
 * @param raw - Value captured from the HTML (may be relative, protocol-relative or absolute).
 * @param base - URL of the page the value was found on.
 * @returns The absolute http(s) URL, or undefined when the value is empty,
 *          unparseable, or uses another scheme (e.g. `data:`).
 */
function resolveImageUrl(raw: string, base: URL): string | undefined {
	// Attribute values are HTML-escaped; `&amp;` is the escape that matters in query strings.
	const cleaned = raw.trim().replace(/&amp;/gi, '&');
	if (cleaned === '') {
		return undefined;
	}
	try {
		const resolved = new URL(cleaned, base);
		return isHttpUrl(resolved) ? resolved.href : undefined;
	} catch {
		return undefined;
	}
}

/** Undoes JSON string escaping (e.g. `\/`) in a captured JSON-LD value; returns the input unchanged on failure. */
function decodeJsonString(raw: string): string {
	try {
		const parsed: unknown = JSON.parse(`"${raw}"`);
		return typeof parsed === 'string' ? parsed : raw;
	} catch {
		return raw;
	}
}

/**
 * Collects candidate image URLs from an HTML document.
 *
 * @param html - Raw page markup.
 * @param base - URL used to resolve relative references.
 * @returns Unique absolute URLs, ordered by source priority (see inline comments).
 */
function extractImageUrls(html: string, base: URL): string[] {
	// A Set de-duplicates while preserving first-insertion (priority) order.
	const found = new Set<string>();
	const add = (raw: string | undefined): void => {
		if (raw === undefined) {
			return;
		}
		const resolved = resolveImageUrl(raw, base);
		if (resolved !== undefined) {
			found.add(resolved);
		}
	};

	// Priority 1: OpenGraph and Twitter meta tags (usually the best product images)
	for (const match of html.matchAll(OG_IMAGE_REGEX)) {
		add(match[1]);
	}
	for (const match of html.matchAll(TWITTER_IMAGE_REGEX)) {
		add(match[1]);
	}

	// Priority 2: JSON-LD structured data (common for e-commerce)
	for (const match of html.matchAll(JSON_LD_IMAGE_REGEX)) {
		const raw = match[1];
		if (raw !== undefined) {
			add(decodeJsonString(raw));
		}
	}

	// Priority 3: data-src attributes (lazy loaded images)
	for (const match of html.matchAll(DATA_SRC_REGEX)) {
		add(match[1]);
	}

	// Priority 4: srcset attributes (responsive images)
	for (const match of html.matchAll(SRCSET_REGEX)) {
		const srcsetValue = match[1];
		if (srcsetValue === undefined) {
			continue;
		}
		// Each comma-separated candidate is "<url> [descriptor]"; keep only the URL part.
		for (const candidate of srcsetValue.split(',')) {
			add(candidate.trim().split(/\s+/)[0]);
		}
	}

	// Priority 5: Regular img src attributes
	for (const match of html.matchAll(IMG_SRC_REGEX)) {
		add(match[1]);
	}

	return [...found];
}

/**
 * Heuristic filter that drops logos, icons, sprites, SVGs and over-long URLs.
 *
 * @param imageUrl - Absolute URL as produced by resolveImageUrl.
 */
function isLikelyContentImage(imageUrl: string): boolean {
	if (imageUrl.length >= MAX_URL_LENGTH) {
		return false;
	}
	const lower = imageUrl.toLowerCase();
	if (EXCLUDED_KEYWORDS.some((keyword) => lower.includes(keyword))) {
		return false;
	}
	// Check the path only, so `.svg?v=2` and `.SVG` are rejected as well.
	return !new URL(imageUrl).pathname.toLowerCase().endsWith('.svg');
}

/**
 * POST handler: fetches the page at the `url` given in the JSON body and
 * returns up to MAX_IMAGES candidate image URLs found in its markup.
 *
 * Responses:
 * - 200 `{ images: string[] }` on success.
 * - 400 `{ error }` when the body is not valid JSON, `url` is missing or not
 *   a non-empty string, `url` is not a valid http(s) URL, or the page
 *   responds with a non-2xx status.
 * - 500 `{ error }` when fetching or scraping throws (including timeout).
 */
export const POST: RequestHandler = async ({ request }) => {
	let body: unknown;
	try {
		body = await request.json();
	} catch {
		return json({ error: 'Invalid JSON body' }, { status: 400 });
	}

	const url = readUrlField(body);
	if (typeof url !== 'string' || url.trim() === '') {
		return json({ error: 'URL is required' }, { status: 400 });
	}

	let targetUrl: URL;
	try {
		targetUrl = new URL(url);
	} catch {
		return json({ error: 'Invalid URL' }, { status: 400 });
	}
	// NOTE(security): this endpoint fetches a caller-supplied URL. Only the
	// scheme is validated here; requests to internal/private hosts are NOT
	// blocked and should be restricted if this route is publicly reachable.
	if (!isHttpUrl(targetUrl)) {
		return json({ error: 'Invalid URL' }, { status: 400 });
	}

	// Abort the outbound request if the remote server stalls.
	const controller = new AbortController();
	const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);

	try {
		const response = await fetch(targetUrl.href, {
			headers: BROWSER_HEADERS,
			signal: controller.signal
		});

		if (!response.ok) {
			return json({ error: 'Failed to fetch URL' }, { status: 400 });
		}

		const html = await response.text();
		// Resolve relative references against the final URL (after redirects) when available.
		const pageUrl = new URL(response.url || targetUrl.href);

		const images = extractImageUrls(html, pageUrl)
			.filter(isLikelyContentImage)
			.slice(0, MAX_IMAGES);

		return json({ images });
	} catch (error) {
		console.error('Failed to scrape images:', error);
		return json({ error: 'Failed to scrape images' }, { status: 500 });
	} finally {
		clearTimeout(timer);
	}
};