From d0b6240c7d294d4f1352436efcd3113bfb273f08 Mon Sep 17 00:00:00 2001 From: stae1102 Date: Sat, 2 Nov 2024 17:42:56 +0900 Subject: [PATCH] refactor: get og data by using object --- src/contents/util/content.util.ts | 152 ++++++++++++++++++++---------- 1 file changed, 100 insertions(+), 52 deletions(-) diff --git a/src/contents/util/content.util.ts b/src/contents/util/content.util.ts index 5298dba..a4af428 100644 --- a/src/contents/util/content.util.ts +++ b/src/contents/util/content.util.ts @@ -1,64 +1,112 @@ -import { BadRequestException } from '@nestjs/common'; +import { + BadRequestException, + InternalServerErrorException, +} from '@nestjs/common'; import * as cheerio from 'cheerio'; -import axios from 'axios'; +import axios, { AxiosResponse } from 'axios'; -export const getLinkInfo = async (link: string) => { - let title: string | undefined = ''; - let coverImg: string | undefined = ''; - let description: string | undefined = ''; - let siteName: string | undefined; +interface OGData { + title: string; + description: string; + image: string; + url: string; + type: string; + site_name: string; + [key: string]: string; // 추가 OG 태그를 위한 인덱스 시그니처 +} - if (!link.match(/^(http|https):\/\//)) { - link = `http://${link}`; +class OGCrawler { + private readonly timeout: number; + private readonly userAgent: string; + private readonly maxRedirects = 5; + + constructor(options: { timeout?: number; userAgent?: string } = {}) { + this.timeout = options.timeout || 5000; + this.userAgent = + options.userAgent || 'Mozilla/5.0 (compatible; OGCrawler/1.0)'; } - // ! TODO 크롤링에서 대략 초 단위 시간 소요 -> 개선 필요 - await axios - .get(link, { - headers: { - 'User-Agent': 'Mozilla/5.0 (compatible; OGCrawler/1.0)', - }, - }) - .then((res) => { - if (res.status !== 200) { - throw new BadRequestException('잘못된 링크입니다.'); - } else { - const data = res.data; - if (typeof data === 'string') { - const $ = cheerio.load(data); - title = - $('title').text() !== '' - ? $('title').text() || $('meta[name="title"]').attr('content') - : 'Untitled'; - $('meta').each((i, el) => { - const meta = $(el); - if (meta.attr('property') === 'og:image') { - coverImg = meta.attr('content'); - } - if (meta.attr('property') === 'og:description') { - description = meta.attr('content'); - } - if (meta.attr('property') === 'og:site_name') { - siteName = meta.attr('content'); - } - }); - } + public async fetch(url: string): Promise { + try { + const response: AxiosResponse = await axios({ + method: 'get', + url, + timeout: this.timeout, + headers: { + 'User-Agent': this.userAgent, + }, + maxRedirects: this.maxRedirects, + }); + + return this.parse(response.data); + } catch (error) { + if (error instanceof Error) { + throw new InternalServerErrorException( + `Failed to fetch URL: ${error.message}`, + ); } - }) - .catch((e) => { - // Control unreachable link - // if(e.message === 'Request failed with status code 403') { - // 403 에러가 발생하는 링크는 크롤링이 불가능한 링크이다. - // } - for (let idx = 1; idx < 3; idx++) { - if (link.split('/').at(-idx) !== '') { - title = link.split('/').at(-idx); - break; - } + throw new InternalServerErrorException('An unknown error occurred'); + } + } + + private parse(html: string): OGData { + const $ = cheerio.load(html); + const ogData: Partial = {}; + + // OG 태그 파싱 + $('meta[property^="og:"]').each((_, element) => { + const property = $(element).attr('property')?.replace('og:', ''); + const content = $(element).attr('content'); + + if (property && content) { + ogData[property as keyof OGData] = content; } - title = title ? title : 'Untitled'; }); + // 기본 메타 태그 백업 파싱 + if (!ogData.title) { + ogData.title = + $('title').text() || $('meta[name="title"]').attr('content') || ''; + } + + if (!ogData.description) { + ogData.description = $('meta[name="description"]').attr('content') || ''; + } + + // 이미지 URL 정규화 + if (ogData.image && !ogData.image.startsWith('http')) { + const baseUrl = $('base').attr('href'); + if (baseUrl) { + try { + ogData.image = new URL(ogData.image, baseUrl).href; + } catch (error) { + console.warn('Failed to normalize image URL:', error); + } + } + } + + // 필수 필드가 있는 완성된 객체 반환 + return { + title: ogData.title || '', + description: ogData.description || '', + image: ogData.image || '', + url: ogData.url || '', + type: ogData.type || '', + site_name: ogData.site_name || '', + ...ogData, // 추가 OG 태그 포함 + }; + } +} + +export const getLinkInfo = async (link: string) => { + const crawler = new OGCrawler(); + const ogData = await crawler.fetch(link); + + const title = ogData.title; + const description = ogData.description; + const coverImg = ogData.image; + const siteName = ogData.site_name; + return { title, description,