add: support for scraping more image links
This commit is contained in:
@@ -11,7 +11,11 @@ export const POST: RequestHandler = async ({ request }) => {
|
||||
try {
|
||||
const response = await fetch(url, {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||
'User-Agent':
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Referer': 'https://www.google.com/'
|
||||
}
|
||||
});
|
||||
|
||||
@@ -24,9 +28,13 @@ export const POST: RequestHandler = async ({ request }) => {
|
||||
const origin = baseUrl.origin;
|
||||
|
||||
const imageUrls: string[] = [];
|
||||
const imgRegex = /<img[^>]+src="([^">]+)"/g;
|
||||
const ogImageRegex = /<meta[^>]+property="og:image"[^>]+content="([^">]+)"/g;
|
||||
const twitterImageRegex = /<meta[^>]+name="twitter:image"[^>]+content="([^">]+)"/g;
|
||||
// Match various image source patterns
|
||||
const imgRegex = /<img[^>]+src=["']([^"'>]+)["']/gi;
|
||||
const srcsetRegex = /<img[^>]+srcset=["']([^"'>]+)["']/gi;
|
||||
const dataSrcRegex = /<img[^>]+data-src=["']([^"'>]+)["']/gi;
|
||||
const ogImageRegex = /<meta[^>]+property=["']og:image["'][^>]+content=["']([^"'>]+)["']/gi;
|
||||
const twitterImageRegex = /<meta[^>]+name=["']twitter:image["'][^>]+content=["']([^"'>]+)["']/gi;
|
||||
const jsonLdRegex = /"image"\s*:\s*"([^"]+)"/gi;
|
||||
|
||||
function toAbsoluteUrl(imgUrl: string): string {
|
||||
if (imgUrl.startsWith('http')) {
|
||||
@@ -43,32 +51,70 @@ export const POST: RequestHandler = async ({ request }) => {
|
||||
|
||||
let match;
|
||||
|
||||
// Priority 1: OpenGraph and Twitter meta tags (usually the best product images)
|
||||
while ((match = ogImageRegex.exec(html)) !== null) {
|
||||
imageUrls.push(toAbsoluteUrl(match[1]));
|
||||
}
|
||||
|
||||
while ((match = twitterImageRegex.exec(html)) !== null) {
|
||||
imageUrls.push(toAbsoluteUrl(match[1]));
|
||||
const url = toAbsoluteUrl(match[1]);
|
||||
if (!imageUrls.includes(url)) {
|
||||
imageUrls.push(url);
|
||||
}
|
||||
}
|
||||
|
||||
// Priority 2: JSON-LD structured data (common for e-commerce)
|
||||
while ((match = jsonLdRegex.exec(html)) !== null) {
|
||||
const url = toAbsoluteUrl(match[1]);
|
||||
if (!imageUrls.includes(url)) {
|
||||
imageUrls.push(url);
|
||||
}
|
||||
}
|
||||
|
||||
// Priority 3: data-src attributes (lazy loaded images)
|
||||
while ((match = dataSrcRegex.exec(html)) !== null) {
|
||||
const url = toAbsoluteUrl(match[1]);
|
||||
if (!imageUrls.includes(url)) {
|
||||
imageUrls.push(url);
|
||||
}
|
||||
}
|
||||
|
||||
// Priority 4: srcset attributes (responsive images)
|
||||
while ((match = srcsetRegex.exec(html)) !== null) {
|
||||
const srcsetValue = match[1];
|
||||
// srcset can have multiple URLs with sizes, extract them
|
||||
const srcsetUrls = srcsetValue.split(',').map((s) => {
|
||||
const parts = s.trim().split(/\s+/);
|
||||
return parts[0]; // Get the URL part before size descriptor
|
||||
});
|
||||
for (const srcsetUrl of srcsetUrls) {
|
||||
const url = toAbsoluteUrl(srcsetUrl);
|
||||
if (!imageUrls.includes(url)) {
|
||||
imageUrls.push(url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Priority 5: Regular img src attributes
|
||||
while ((match = imgRegex.exec(html)) !== null) {
|
||||
const imgUrl = match[1];
|
||||
const fullUrl = toAbsoluteUrl(imgUrl);
|
||||
if (!imageUrls.includes(fullUrl)) {
|
||||
imageUrls.push(fullUrl);
|
||||
const url = toAbsoluteUrl(match[1]);
|
||||
if (!imageUrls.includes(url)) {
|
||||
imageUrls.push(url);
|
||||
}
|
||||
}
|
||||
|
||||
const filteredImages = imageUrls.filter(
|
||||
(url) =>
|
||||
!url.includes('logo') &&
|
||||
!url.includes('icon') &&
|
||||
!url.includes('sprite') &&
|
||||
!url.toLowerCase().includes('logo') &&
|
||||
!url.toLowerCase().includes('icon') &&
|
||||
!url.toLowerCase().includes('sprite') &&
|
||||
!url.toLowerCase().includes('favicon') &&
|
||||
!url.endsWith('.svg') &&
|
||||
url.length < 500
|
||||
url.length < 1000 && // Increased limit for modern CDN URLs
|
||||
!url.includes('data:image') // Skip data URLs
|
||||
);
|
||||
|
||||
return json({ images: filteredImages.slice(0, 20) });
|
||||
return json({ images: filteredImages.slice(0, 30) });
|
||||
} catch (error) {
|
||||
return json({ error: 'Failed to scrape images' }, { status: 500 });
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user