Merge pull request #1015 from mendableai/nsc/improves-sitemap-fetching
Improves sitemap fetching
nickscamara authored Dec 27, 2024
2 parents 2ea0e9a + 4451c4f commit c5b6495
Showing 2 changed files with 102 additions and 36 deletions.
79 changes: 55 additions & 24 deletions apps/api/src/scraper/WebScraper/crawler.ts
@@ -446,44 +446,75 @@ export class WebCrawler {
};

const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;

let sitemapLinks: string[] = [];

// Try to get sitemap from the provided URL first
try {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger);
}
sitemapLinks = await getLinksFromSitemap(
{ sitemapUrl, allUrls: [], mode: "fire-engine" },
this.logger,
);
} catch (error) {
this.logger.debug(
`Failed to fetch sitemap with axios from ${sitemapUrl}`,
`Failed to fetch sitemap from ${sitemapUrl}`,
{ method: "tryFetchSitemapLinks", sitemapUrl, error },
);
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
const response = await getLinksFromSitemap(
{ sitemapUrl, mode: "fire-engine" },
this.logger,
);
if (response) {
sitemapLinks = response;
}

// If this is a subdomain, also try to get sitemap from the main domain
try {
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const domainParts = hostname.split('.');

// Check if this is a subdomain (has more than 2 parts and not www)
if (domainParts.length > 2 && domainParts[0] !== 'www') {
// Get the main domain by taking the last two parts
const mainDomain = domainParts.slice(-2).join('.');
const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;

try {
// Get all links from the main domain's sitemap
const mainDomainLinks = await getLinksFromSitemap(
{ sitemapUrl: mainDomainSitemapUrl, allUrls: [], mode: "fire-engine" },
this.logger,
);
// Filter links to only include those pointing to the current subdomain
const subdomainLinks = mainDomainLinks.filter(link => {
try {
const linkUrl = new URL(link);
return linkUrl.hostname.endsWith(hostname);
} catch {
return false;
}
});
sitemapLinks = [...new Set([...sitemapLinks, ...subdomainLinks])];
} catch (error) {
this.logger.debug(
`Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
{ method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
);
}
}
} catch (error) {
this.logger.debug(`Error processing main domain sitemap`, {
method: "tryFetchSitemapLinks",
url,
error,
});
}

// If no sitemap found yet, try the baseUrl as a last resort
if (sitemapLinks.length === 0) {
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
try {
const response = await axios.get(baseUrlSitemap, {
timeout: axiosTimeout,
});
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
this.logger,
);
}
const baseLinks = await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, allUrls: [], mode: "fire-engine" },
this.logger,
);

sitemapLinks = [...new Set([...sitemapLinks, ...baseLinks])];
} catch (error) {
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
method: "tryFetchSitemapLinks",
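The crawler change above layers three sitemap sources: the requested URL's own /sitemap.xml, the main domain's sitemap filtered down to the current subdomain, and (only if nothing was found) the base URL's sitemap, with duplicates removed via a Set. A minimal standalone sketch of the new subdomain fallback follows; fetchSitemapLinks is a hypothetical stand-in for getLinksFromSitemap({ sitemapUrl, allUrls: [], mode: "fire-engine" }, logger), not an API from this repository.

// Sketch only: fetchSitemapLinks is a hypothetical stand-in for getLinksFromSitemap.
type FetchSitemapLinks = (sitemapUrl: string) => Promise<string[]>;

async function collectSubdomainSitemapLinks(
  url: string,
  fetchSitemapLinks: FetchSitemapLinks,
): Promise<string[]> {
  const { hostname, protocol } = new URL(url);
  const parts = hostname.split(".");

  // Only treat hosts with more than two labels (and not "www.") as subdomains.
  if (parts.length <= 2 || parts[0] === "www") return [];

  // Main domain = last two labels, e.g. docs.example.com -> example.com.
  const mainDomain = parts.slice(-2).join(".");
  const links = await fetchSitemapLinks(`${protocol}//${mainDomain}/sitemap.xml`);

  // Keep only links whose hostname ends with the current subdomain's hostname.
  return links.filter((link) => {
    try {
      return new URL(link).hostname.endsWith(hostname);
    } catch {
      return false; // skip malformed URLs listed in the sitemap
    }
  });
}

Taking the last two labels is the same simplification the PR makes; it does not account for multi-part public suffixes such as .co.uk, where example.co.uk would be reduced to co.uk.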
59 changes: 47 additions & 12 deletions apps/api/src/scraper/WebScraper/sitemap.ts
@@ -5,7 +5,9 @@ import { WebCrawler } from "./crawler";
import { scrapeURL } from "../scrapeURL";
import { scrapeOptions } from "../../controllers/v1/types";
import type { Logger } from "winston";

const useFireEngine =
process.env.FIRE_ENGINE_BETA_URL !== "" &&
process.env.FIRE_ENGINE_BETA_URL !== undefined;
export async function getLinksFromSitemap(
{
sitemapUrl,
@@ -21,10 +23,7 @@ export async function getLinksFromSitemap(
try {
let content: string = "";
try {
if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
} else if (mode === "fire-engine") {
if (mode === "fire-engine" && useFireEngine) {
const response = await scrapeURL(
"sitemap",
sitemapUrl,
@@ -35,6 +34,9 @@ export async function getLinksFromSitemap(
throw response.error;
}
content = response.document.rawHtml!;
} else {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
}
} catch (error) {
logger.error(`Request failed for ${sitemapUrl}`, {
@@ -43,29 +45,62 @@
sitemapUrl,
error,
});

return allUrls;
}

const parsed = await parseStringPromise(content);
const root = parsed.urlset || parsed.sitemapindex;

if (root && root.sitemap) {
const sitemapPromises = root.sitemap
// Handle sitemap index files
const sitemapUrls = root.sitemap
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
.map((sitemap) =>
.map((sitemap) => sitemap.loc[0]);

const sitemapPromises = sitemapUrls.map((sitemapUrl) =>
getLinksFromSitemap(
{ sitemapUrl, allUrls: [], mode },
logger,
),
);

const results = await Promise.all(sitemapPromises);
results.forEach(urls => {
allUrls.push(...urls);
});
} else if (root && root.url) {
// Check if any URLs point to additional sitemaps
const xmlSitemaps = root.url
.filter(
(url) =>
url.loc &&
url.loc.length > 0 &&
url.loc[0].toLowerCase().endsWith('.xml')
)
.map((url) => url.loc[0]);

if (xmlSitemaps.length > 0) {
// Recursively fetch links from additional sitemaps
const sitemapPromises = xmlSitemaps.map((sitemapUrl) =>
getLinksFromSitemap(
{ sitemapUrl: sitemap.loc[0], allUrls, mode },
{ sitemapUrl, allUrls: [], mode },
logger,
),
);
await Promise.all(sitemapPromises);
} else if (root && root.url) {

const results = await Promise.all(sitemapPromises);
results.forEach(urls => {
allUrls.push(...urls);
});
}

// Add regular URLs that aren't sitemaps
const validUrls = root.url
.filter(
(url) =>
url.loc &&
url.loc.length > 0 &&
!url.loc[0].toLowerCase().endsWith('.xml') &&
!WebCrawler.prototype.isFile(url.loc[0]),
)
.map((url) => url.loc[0]);
@@ -80,7 +115,7 @@
});
}

return allUrls;
return [...new Set(allUrls)];
}

export const fetchSitemapData = async (
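In sitemap.ts, the rewrite prefers the fire-engine scraper whenever FIRE_ENGINE_BETA_URL is set and falls back to a plain axios GET otherwise, then recurses into <sitemapindex> entries as well as any <urlset> entries that themselves end in .xml, and finally deduplicates the collected URLs. Below is a condensed sketch of just the parse-and-recurse shape, using the same xml2js parseStringPromise call as the changed file but with a hypothetical fetchXml helper standing in for the fire-engine/axios branch.

import { parseStringPromise } from "xml2js";

// Sketch only: fetchXml is a hypothetical stand-in for the fire-engine/axios fetch above.
async function collectSitemapUrls(
  sitemapUrl: string,
  fetchXml: (url: string) => Promise<string>,
): Promise<string[]> {
  const parsed = await parseStringPromise(await fetchXml(sitemapUrl));
  const root = parsed.urlset || parsed.sitemapindex;
  if (!root) return [];

  const pageUrls: string[] = [];
  const childSitemaps: string[] = [];

  if (root.sitemap) {
    // <sitemapindex>: every <sitemap><loc> points at another sitemap to recurse into.
    for (const entry of root.sitemap) {
      if (entry.loc?.[0]) childSitemaps.push(entry.loc[0]);
    }
  } else if (root.url) {
    for (const entry of root.url) {
      const loc = entry.loc?.[0];
      if (!loc) continue;
      // Some sites list further sitemaps inside a <urlset>; follow those as well.
      if (loc.toLowerCase().endsWith(".xml")) childSitemaps.push(loc);
      else pageUrls.push(loc);
    }
  }

  const nested = await Promise.all(
    childSitemaps.map((child) => collectSitemapUrls(child, fetchXml)),
  );
  return [...new Set([...pageUrls, ...nested.flat()])];
}

The real implementation additionally skips file URLs via WebCrawler.prototype.isFile and pushes results into its allUrls accumulator, but the recursion and the final Set-based deduplication follow the same shape.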
