diff --git a/src/routes/api/scrape-images/+server.ts b/src/routes/api/scrape-images/+server.ts
index a4515dd..bdd7105 100644
--- a/src/routes/api/scrape-images/+server.ts
+++ b/src/routes/api/scrape-images/+server.ts
@@ -35,7 +35,9 @@ export const POST: RequestHandler = async ({ request }) => {
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
- 'Referer': 'https://www.google.com/'
+ 'Accept-Encoding': 'gzip, deflate, br',
+ 'Cache-Control': 'no-cache',
+ 'Pragma': 'no-cache'
}
});
@@ -48,13 +50,6 @@ export const POST: RequestHandler = async ({ request }) => {
const origin = baseUrl.origin;
const imageUrls: string[] = [];
- // Match various image source patterns
- const imgRegex = /
]+src=["']([^"'>]+)["']/gi;
- const srcsetRegex = /
]+srcset=["']([^"'>]+)["']/gi;
- const dataSrcRegex = /
]+data-src=["']([^"'>]+)["']/gi;
- const ogImageRegex = /]+property=["']og:image["'][^>]+content=["']([^"'>]+)["']/gi;
- const twitterImageRegex = /]+name=["']twitter:image["'][^>]+content=["']([^"'>]+)["']/gi;
- const jsonLdRegex = /"image"\s*:\s*"([^"]+)"/gi;
function toAbsoluteUrl(imgUrl: string): string {
if (imgUrl.startsWith('http')) {
@@ -69,72 +64,157 @@ export const POST: RequestHandler = async ({ request }) => {
return `${origin}/${imgUrl}`;
}
+ // Heuristic filter for scraped image URLs. Returns false when the URL looks
+ // like site chrome (logos, icons, navigation, social badges), an SVG, an
+ // inline data URI, or a loading placeholder; returns true otherwise.
+ // Keyword checks are case-insensitive substring tests over the whole URL,
+ // so a keyword anywhere in the host, path, or query rejects the image.
+ function isLikelyProductImage(url: string): boolean {
+   const lower = url.toLowerCase();
+   const badPatterns = [
+     'logo', 'icon', 'sprite', 'favicon', 'banner', 'footer',
+     'header', 'background', 'pattern', 'placeholder', 'thumbnail-small',
+     'btn', 'button', 'menu', 'nav', 'navigation', 'social',
+     'instagram', 'facebook', 'twitter', 'linkedin', 'pinterest'
+   ];
+   if (badPatterns.some(pattern => lower.includes(pattern))) {
+     return false;
+   }
+   // Test the lowercased URL and accept a trailing query string or fragment,
+   // so "IMAGE.SVG" and "image.svg?v=2" are rejected just like "image.svg".
+   if (/\.svg(?:[?#]|$)/.test(lower)) {
+     return false;
+   }
+   if (lower.includes('data:image')) {
+     return false;
+   }
+   if (lower.includes('loading') || lower.includes('spinner') || lower.includes('skeleton')) {
+     return false;
+   }
+   return true;
+ }
+
let match;
- // Priority 1: OpenGraph and Twitter meta tags (usually the best product images)
+ // Priority 1: OpenGraph and Twitter meta tags (main product image)
+ // Each pattern captures the content attribute of a <meta> tag whose
+ // property/name identifies a social preview image. The `<meta[^>` prefix
+ // anchors the match to a meta tag; a pattern starting at `]+` would require
+ // a literal "]" before the attribute and never match real markup.
+ // Only tags that list property/name before content are matched.
+ const ogImageRegex = /<meta[^>]+property=["']og:image["'][^>]+content=["']([^"'>]+)["']/gi;
+ const twitterImageRegex = /<meta[^>]+name=["']twitter:image["'][^>]+content=["']([^"'>]+)["']/gi;
+
while ((match = ogImageRegex.exec(html)) !== null) {
- imageUrls.push(toAbsoluteUrl(match[1]));
+ const url = toAbsoluteUrl(match[1]);
+ if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
+ imageUrls.push(url);
+ }
}
while ((match = twitterImageRegex.exec(html)) !== null) {
const url = toAbsoluteUrl(match[1]);
- if (!imageUrls.includes(url)) {
+ if (isLikelyProductImage(url) && !imageUrls.includes(url)) {
imageUrls.push(url);
}
}
- // Priority 2: JSON-LD structured data (common for e-commerce)
+ // Priority 2: Look for JSON-LD structured data (very common in modern e-commerce)
+ const jsonLdRegex = /