From 4451c4f67153fae0876f2e7136e2f1a4932374d9 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Thu, 26 Dec 2024 13:51:20 -0300
Subject: [PATCH] Nick:

---
 apps/api/src/scraper/WebScraper/crawler.ts | 79 +++++++++++++++-------
 apps/api/src/scraper/WebScraper/sitemap.ts | 59 ++++++++++++----
 2 files changed, 102 insertions(+), 36 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 2e47d3526..41bee2d6d 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -446,44 +446,75 @@ export class WebCrawler {
     };
 
     const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;
-
     let sitemapLinks: string[] = [];
+    // Try to get sitemap from the provided URL first
     try {
-      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
-      if (response.status === 200) {
-        sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger);
-      }
+      sitemapLinks = await getLinksFromSitemap(
+        { sitemapUrl, allUrls: [], mode: "fire-engine" },
+        this.logger,
+      );
     } catch (error) {
       this.logger.debug(
-        `Failed to fetch sitemap with axios from ${sitemapUrl}`,
+        `Failed to fetch sitemap from ${sitemapUrl}`,
         { method: "tryFetchSitemapLinks", sitemapUrl, error },
       );
-      if (error instanceof AxiosError && error.response?.status === 404) {
-        // ignore 404
-      } else {
-        const response = await getLinksFromSitemap(
-          { sitemapUrl, mode: "fire-engine" },
-          this.logger,
-        );
-        if (response) {
-          sitemapLinks = response;
+    }
+
+    // If this is a subdomain, also try to get sitemap from the main domain
+    try {
+      const urlObj = new URL(url);
+      const hostname = urlObj.hostname;
+      const domainParts = hostname.split('.');
+
+      // Check if this is a subdomain (has more than 2 parts and not www)
+      if (domainParts.length > 2 && domainParts[0] !== 'www') {
+        // Get the main domain by taking the last two parts
+        const mainDomain = domainParts.slice(-2).join('.');
+        const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
+        const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;
+
+        try {
+          // Get all links from the main domain's sitemap
+          const mainDomainLinks = await getLinksFromSitemap(
+            { sitemapUrl: mainDomainSitemapUrl, allUrls: [], mode: "fire-engine" },
+            this.logger,
+          );
+          // Filter links to only include those pointing to the current subdomain
+          const subdomainLinks = mainDomainLinks.filter(link => {
+            try {
+              const linkUrl = new URL(link);
+              return linkUrl.hostname.endsWith(hostname);
+            } catch {
+              return false;
+            }
+          });
+          sitemapLinks = [...new Set([...sitemapLinks, ...subdomainLinks])];
+        } catch (error) {
+          this.logger.debug(
+            `Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
+            { method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
+          );
         }
       }
+    } catch (error) {
+      this.logger.debug(`Error processing main domain sitemap`, {
+        method: "tryFetchSitemapLinks",
+        url,
+        error,
+      });
     }
 
+    // If no sitemap found yet, try the baseUrl as a last resort
     if (sitemapLinks.length === 0) {
      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
       try {
-        const response = await axios.get(baseUrlSitemap, {
-          timeout: axiosTimeout,
-        });
-        if (response.status === 200) {
-          sitemapLinks = await getLinksFromSitemap(
-            { sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
-            this.logger,
-          );
-        }
+        const baseLinks = await getLinksFromSitemap(
+          { sitemapUrl: baseUrlSitemap, allUrls: [], mode: "fire-engine" },
+          this.logger,
+        );
+
+        sitemapLinks = [...new Set([...sitemapLinks, ...baseLinks])];
       } catch (error) {
         this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
           method: "tryFetchSitemapLinks",
diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts
index c080373e2..2529c022d 100644
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@@ -5,7 +5,9 @@ import { WebCrawler } from "./crawler";
 import { scrapeURL } from "../scrapeURL";
 import { scrapeOptions } from "../../controllers/v1/types";
 import type { Logger } from "winston";
-
+const useFireEngine =
+  process.env.FIRE_ENGINE_BETA_URL !== "" &&
+  process.env.FIRE_ENGINE_BETA_URL !== undefined;
 export async function getLinksFromSitemap(
   {
     sitemapUrl,
@@ -21,10 +23,7 @@
   try {
     let content: string = "";
     try {
-      if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") {
-        const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
-        content = response.data;
-      } else if (mode === "fire-engine") {
+      if (mode === "fire-engine" && useFireEngine) {
         const response = await scrapeURL(
           "sitemap",
           sitemapUrl,
@@ -35,6 +34,9 @@
           throw response.error;
         }
         content = response.document.rawHtml!;
+      } else {
+        const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
+        content = response.data;
       }
     } catch (error) {
       logger.error(`Request failed for ${sitemapUrl}`, {
@@ -43,7 +45,6 @@
         sitemapUrl,
         error,
       });
-
       return allUrls;
     }
 
@@ -51,21 +52,55 @@
     const root = parsed.urlset || parsed.sitemapindex;
 
     if (root && root.sitemap) {
-      const sitemapPromises = root.sitemap
+      // Handle sitemap index files
+      const sitemapUrls = root.sitemap
         .filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
-        .map((sitemap) =>
+        .map((sitemap) => sitemap.loc[0]);
+
+      const sitemapPromises = sitemapUrls.map((sitemapUrl) =>
+        getLinksFromSitemap(
+          { sitemapUrl, allUrls: [], mode },
+          logger,
+        ),
+      );
+
+      const results = await Promise.all(sitemapPromises);
+      results.forEach(urls => {
+        allUrls.push(...urls);
+      });
+    } else if (root && root.url) {
+      // Check if any URLs point to additional sitemaps
+      const xmlSitemaps = root.url
+        .filter(
+          (url) =>
+            url.loc &&
+            url.loc.length > 0 &&
+            url.loc[0].toLowerCase().endsWith('.xml')
+        )
+        .map((url) => url.loc[0]);
+
+      if (xmlSitemaps.length > 0) {
+        // Recursively fetch links from additional sitemaps
+        const sitemapPromises = xmlSitemaps.map((sitemapUrl) =>
           getLinksFromSitemap(
-            { sitemapUrl: sitemap.loc[0], allUrls, mode },
+            { sitemapUrl, allUrls: [], mode },
             logger,
           ),
         );
-      await Promise.all(sitemapPromises);
-    } else if (root && root.url) {
+
+        const results = await Promise.all(sitemapPromises);
+        results.forEach(urls => {
+          allUrls.push(...urls);
+        });
+      }
+
+      // Add regular URLs that aren't sitemaps
       const validUrls = root.url
         .filter(
           (url) =>
             url.loc &&
             url.loc.length > 0 &&
+            !url.loc[0].toLowerCase().endsWith('.xml') &&
             !WebCrawler.prototype.isFile(url.loc[0]),
         )
         .map((url) => url.loc[0]);
@@ -80,7 +115,7 @@
     });
   }
 
-  return allUrls;
+  return [...new Set(allUrls)];
 }
 
 export const fetchSitemapData = async (
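
Note on the new subdomain handling in crawler.ts: the filter keeps any main-domain sitemap entry whose hostname ends with the crawl target's hostname. The sketch below isolates that predicate so it can be run standalone; the helper name and sample URLs are illustrative only, not part of the patch, and it assumes a Node or browser runtime where the WHATWG URL class is global.

```ts
// Minimal standalone sketch of the subdomain filter added in crawler.ts.
// filterLinksForSubdomain is a hypothetical helper, not a name from the patch.
function filterLinksForSubdomain(links: string[], hostname: string): string[] {
  return links.filter((link) => {
    try {
      // Same predicate as the patch: keep links whose hostname ends with the
      // target hostname. The suffix match is loose: it would also accept e.g.
      // "evil-docs.example.com" when crawling "docs.example.com"; an exact
      // equality check would be stricter.
      return new URL(link).hostname.endsWith(hostname);
    } catch {
      // Malformed URLs are dropped, mirroring the patch's catch branch.
      return false;
    }
  });
}

// Example run (hypothetical URLs):
console.log(
  filterLinksForSubdomain(
    [
      "https://docs.example.com/guide", // kept: matches the target hostname
      "https://example.com/pricing",    // dropped: main domain only
      "https://blog.example.com/post",  // dropped: different subdomain
      "not-a-url",                      // dropped: fails URL parsing
    ],
    "docs.example.com",
  ),
); // => [ 'https://docs.example.com/guide' ]
```

The same commit also reworks the recursion in getLinksFromSitemap: each child call now receives a fresh `allUrls: []` and the results are merged with `[...new Set(...)]`, so duplicates across nested sitemaps are collapsed at each level instead of being pushed into a shared accumulator.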