import { json } from '@sveltejs/kit';
import type { RequestHandler } from './$types';

/** Maximum time, in milliseconds, allowed for fetching and reading the target page. */
const FETCH_TIMEOUT_MS = 10000;

/** Maximum number of image URLs returned to the caller. */
const MAX_IMAGES = 30;

/** Candidate URLs at or above this length are discarded. */
const MAX_URL_LENGTH = 2000;

/** Request headers sent with the outbound page fetch (browser-like). */
const BROWSER_HEADERS: Record<string, string> = {
	'User-Agent':
		'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
	'Accept':
		'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
	'Accept-Language': 'en-US,en;q=0.9',
	'Accept-Encoding': 'gzip, deflate, br',
	'Cache-Control': 'no-cache',
	'Pragma': 'no-cache'
};

/** Substrings that mark a URL as site chrome / placeholder rather than a product image. */
const REJECTED_URL_FRAGMENTS: readonly string[] = [
	'logo',
	'icon',
	'sprite',
	'favicon',
	'banner',
	'footer',
	'header',
	'background',
	'pattern',
	'placeholder',
	'thumbnail-small',
	'btn',
	'button',
	'menu',
	'nav',
	'navigation',
	'social',
	'instagram',
	'facebook',
	'twitter',
	'linkedin',
	'pinterest',
	'data:image',
	'loading',
	'spinner',
	'skeleton'
];

/** JSON-LD property names whose values are treated as images. */
const JSON_LD_IMAGE_KEYS: ReadonlySet<string> = new Set(['image', 'thumbnail', 'thumbnailUrl']);

const META_TAG_PATTERN = /<meta\b[^>]*>/gi;
const JSON_LD_PATTERN =
	/<script\b[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
const DATA_IMAGE_PATTERN = /<[^>]+data-image=["']([^"'>]+)["']/gi;
const SRCSET_PATTERN = /<(?:img|source)\b[^>]*?\ssrcset\s*=\s*["']([^"'>]+)["']/gi;
const DATA_SRC_PATTERN = /<img\b[^>]*?\sdata-src\s*=\s*["']([^"'>]+)["']/gi;
// The leading `\s` keeps this from matching the tail of `data-src=`.
const IMG_SRC_PATTERN = /<img\b[^>]*?\ssrc\s*=\s*["']([^"'>]+)["']/gi;
// Non-capturing `(?:-image)?` so that group 1 is the URL itself.
const BACKGROUND_PATTERN = /background(?:-image)?:\s*url\(["']?([^"')]*)["']?/gi;

/** Returns true for dotted-quad IPv4 literals in unspecified, loopback, private or link-local ranges. */
function isBlockedIpv4(hostname: string): boolean {
	const parts = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(hostname);
	if (!parts) {
		return false;
	}
	const first = Number(parts[1]);
	const second = Number(parts[2]);
	return (
		first === 0 ||
		first === 10 ||
		first === 127 ||
		(first === 169 && second === 254) ||
		(first === 172 && second >= 16 && second <= 31) ||
		(first === 192 && second === 168)
	);
}

/**
 * Returns true for IPv6 literals (without brackets) that are unspecified, loopback,
 * IPv4-mapped, unique-local (fc00::/7) or link-local (fe80::/10).
 */
function isBlockedIpv6(address: string): boolean {
	return (
		address === '::' ||
		address === '::1' ||
		// IPv4-mapped addresses are rejected wholesale rather than decoded.
		address.startsWith('::ffff:') ||
		/^f[cd][0-9a-f]{2}:/.test(address) ||
		/^fe[89ab][0-9a-f]:/.test(address)
	);
}

/**
 * Checks that a user-supplied string is an absolute http(s) URL whose host is not
 * `localhost` or a loopback/private/link-local IP literal.
 *
 * This is a literal-hostname check only: it does not resolve DNS, so a public name
 * that resolves to an internal address is not detected here.
 *
 * @param urlString - The raw URL string supplied by the client.
 * @returns `true` when the URL is acceptable to fetch, `false` otherwise.
 */
function isValidUrl(urlString: string): boolean {
	let parsed: URL;
	try {
		parsed = new URL(urlString);
	} catch {
		return false;
	}

	if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
		return false;
	}

	// Drop a trailing root dot so `localhost.` is treated like `localhost`.
	const hostname = parsed.hostname.toLowerCase().replace(/\.$/, '');
	if (hostname === 'localhost' || hostname.endsWith('.localhost')) {
		return false;
	}

	// `URL.hostname` keeps IPv6 literals in square brackets, e.g. `[::1]`.
	if (hostname.startsWith('[')) {
		return !isBlockedIpv6(hostname.slice(1, -1));
	}
	return !isBlockedIpv4(hostname);
}

/**
 * Resolves a possibly-relative image reference against the page URL.
 *
 * @param raw - The reference as found in the markup or JSON-LD.
 * @param base - The URL of the page the reference was found on.
 * @returns The absolute http(s) URL, or `null` when the reference is empty,
 *          unparseable, or uses another scheme (e.g. `data:`).
 */
function toAbsoluteUrl(raw: string, base: URL): string | null {
	const trimmed = raw.trim();
	if (trimmed === '') {
		return null;
	}
	try {
		const resolved = new URL(trimmed, base);
		const isHttp = resolved.protocol === 'http:' || resolved.protocol === 'https:';
		return isHttp ? resolved.href : null;
	} catch {
		return null;
	}
}

/**
 * Heuristic filter: rejects URLs containing any of the known non-product
 * substrings and URLs whose path ends in `.svg`.
 *
 * @param url - An absolute image URL.
 * @returns `true` when the URL passes the filter.
 */
function isLikelyProductImage(url: string): boolean {
	const lower = url.toLowerCase();
	if (REJECTED_URL_FRAGMENTS.some((fragment) => lower.includes(fragment))) {
		return false;
	}
	// Case-insensitive, and tolerant of a trailing query string or fragment.
	return !/\.svg(?:[?#]|$)/.test(lower);
}

/** Reads a quoted attribute value from a single HTML tag string; `name` must be a plain attribute name. */
function getAttribute(tag: string, name: string): string | undefined {
	const pattern = new RegExp(`\\s${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, 'i');
	const found = pattern.exec(tag);
	return found?.[1] ?? found?.[2];
}

/** Returns true when a JSON-LD node declares `@type` of `ImageObject` (string or array form). */
function isImageObject(node: Record<string, unknown>): boolean {
	const declared = node['@type'];
	const types = Array.isArray(declared) ? declared : [declared];
	return types.includes('ImageObject');
}

/** Feeds an image-valued JSON-LD property (string, array, or object with `url`/`contentUrl`) to `add`. */
function addJsonLdImageValue(value: unknown, add: (raw: string) => void): void {
	if (typeof value === 'string') {
		add(value);
	} else if (Array.isArray(value)) {
		for (const item of value) {
			addJsonLdImageValue(item, add);
		}
	} else if (typeof value === 'object' && value !== null) {
		const record = value as Record<string, unknown>;
		for (const key of ['url', 'contentUrl']) {
			const candidate = record[key];
			if (typeof candidate === 'string') {
				add(candidate);
			}
		}
	}
}

/**
 * Walks parsed JSON-LD and reports image references.
 *
 * A bare `url` property is only used when it belongs to an image value or an
 * `ImageObject` node; on other nodes it is not treated as an image.
 */
function collectJsonLdImages(node: unknown, add: (raw: string) => void): void {
	if (Array.isArray(node)) {
		for (const item of node) {
			collectJsonLdImages(item, add);
		}
		return;
	}
	if (typeof node !== 'object' || node === null) {
		return;
	}

	const record = node as Record<string, unknown>;
	if (isImageObject(record)) {
		addJsonLdImageValue(record, add);
	}
	for (const [key, value] of Object.entries(record)) {
		if (JSON_LD_IMAGE_KEYS.has(key)) {
			addJsonLdImageValue(value, add);
		} else {
			collectJsonLdImages(value, add);
		}
	}
}

/**
 * Extracts candidate product image URLs from an HTML document, in priority order:
 * og:image / twitter:image meta tags, JSON-LD, `data-image`, `srcset`, `data-src`,
 * `<img src>`, and CSS `background[-image]: url(...)`.
 *
 * @param html - The page markup.
 * @param pageUrl - Absolute URL of the page, used to resolve relative references.
 * @returns De-duplicated absolute URLs in discovery order, at most {@link MAX_IMAGES}.
 */
function extractImageUrls(html: string, pageUrl: string): string[] {
	const base = new URL(pageUrl);
	// A Set keeps insertion order, so it de-duplicates without losing priority.
	const found = new Set<string>();

	const add = (raw: string | undefined): void => {
		if (!raw) {
			return;
		}
		const absolute = toAbsoluteUrl(raw, base);
		if (
			absolute !== null &&
			absolute.length < MAX_URL_LENGTH &&
			isLikelyProductImage(absolute)
		) {
			found.add(absolute);
		}
	};

	const addFirstGroupOfAll = (pattern: RegExp): void => {
		for (const hit of html.matchAll(pattern)) {
			add(hit[1]);
		}
	};

	// Priority 1: OpenGraph, then Twitter meta tags. Attributes are read per tag,
	// so `content` may appear before or after `property` / `name`.
	const metaTags = [...html.matchAll(META_TAG_PATTERN)].map((hit) => hit[0]);
	for (const tag of metaTags) {
		if (getAttribute(tag, 'property')?.toLowerCase() === 'og:image') {
			add(getAttribute(tag, 'content'));
		}
	}
	for (const tag of metaTags) {
		if (getAttribute(tag, 'name')?.toLowerCase() === 'twitter:image') {
			add(getAttribute(tag, 'content'));
		}
	}

	// Priority 2: JSON-LD structured data.
	for (const hit of html.matchAll(JSON_LD_PATTERN)) {
		try {
			const parsed: unknown = JSON.parse(hit[1] ?? '');
			collectJsonLdImages(parsed, add);
		} catch {
			// Best effort: a malformed JSON-LD block is skipped, the rest of the page is still scanned.
		}
	}

	// Priority 3: data-image attributes.
	addFirstGroupOfAll(DATA_IMAGE_PATTERN);

	// Priority 4: srcset attributes; each comma-separated entry is "<url> [descriptor]".
	for (const hit of html.matchAll(SRCSET_PATTERN)) {
		for (const entry of (hit[1] ?? '').split(',')) {
			add(entry.trim().split(/\s+/)[0]);
		}
	}

	// Priority 5: data-src attributes (lazy loaded).
	addFirstGroupOfAll(DATA_SRC_PATTERN);

	// Priority 6: regular img src attributes.
	addFirstGroupOfAll(IMG_SRC_PATTERN);

	// Priority 7: background images in inline styles / style blocks.
	addFirstGroupOfAll(BACKGROUND_PATTERN);

	return [...found].slice(0, MAX_IMAGES);
}

/**
 * POST handler: expects a JSON body `{ url: string }`, fetches that page and
 * responds with `{ images: string[] }`.
 *
 * Responses:
 * - 400 `{ error }` for an unreadable body, a missing/invalid URL, or a non-OK upstream response.
 * - 500 `{ error: 'Failed to scrape images' }` when fetching or scraping throws (including timeout).
 */
export const POST: RequestHandler = async ({ request }) => {
	let url: unknown;
	try {
		const body: unknown = await request.json();
		url =
			typeof body === 'object' && body !== null
				? (body as Record<string, unknown>)['url']
				: undefined;
	} catch {
		return json({ error: 'Invalid JSON body' }, { status: 400 });
	}

	if (!url) {
		return json({ error: 'URL is required' }, { status: 400 });
	}

	if (typeof url !== 'string' || !isValidUrl(url)) {
		return json({ error: 'Invalid URL' }, { status: 400 });
	}

	// Abort the upstream request (headers and body) if it takes too long.
	const controller = new AbortController();
	const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);

	try {
		const response = await fetch(url, {
			headers: BROWSER_HEADERS,
			signal: controller.signal
		});

		if (!response.ok) {
			return json({ error: 'Failed to fetch URL' }, { status: 400 });
		}

		// After redirects the final URL may differ from the requested one. Re-check it
		// so content served from a blocked host is not returned. The request itself has
		// already been made at this point; this only limits what is disclosed.
		const pageUrl = response.url || url;
		if (!isValidUrl(pageUrl)) {
			return json({ error: 'Invalid URL' }, { status: 400 });
		}

		const html = await response.text();
		return json({ images: extractImageUrls(html, pageUrl) });
	} catch (error) {
		console.error('Failed to scrape images from', url, error);
		return json({ error: 'Failed to scrape images' }, { status: 500 });
	} finally {
		clearTimeout(timer);
	}
};