Skip to content

Commit

Permalink
Merge pull request #358 from Quickchive/chore/QA-356/improve-og-crawling
Browse files Browse the repository at this point in the history
refactor: get og data by using object
  • Loading branch information
stae1102 authored Nov 2, 2024
2 parents 18b0e0a + d0b6240 commit b3dbc4f
Showing 1 changed file with 100 additions and 52 deletions.
152 changes: 100 additions & 52 deletions src/contents/util/content.util.ts
Original file line number Diff line number Diff line change
@@ -1,64 +1,112 @@
import { BadRequestException } from '@nestjs/common';
import {
BadRequestException,
InternalServerErrorException,
} from '@nestjs/common';
import * as cheerio from 'cheerio';
import axios from 'axios';
import axios, { AxiosResponse } from 'axios';

export const getLinkInfo = async (link: string) => {
let title: string | undefined = '';
let coverImg: string | undefined = '';
let description: string | undefined = '';
let siteName: string | undefined;
interface OGData {
title: string;
description: string;
image: string;
url: string;
type: string;
site_name: string;
[key: string]: string; // 추가 OG 태그를 위한 인덱스 시그니처
}

if (!link.match(/^(http|https):\/\//)) {
link = `http://${link}`;
class OGCrawler {
private readonly timeout: number;
private readonly userAgent: string;
private readonly maxRedirects = 5;

constructor(options: { timeout?: number; userAgent?: string } = {}) {
this.timeout = options.timeout || 5000;
this.userAgent =
options.userAgent || 'Mozilla/5.0 (compatible; OGCrawler/1.0)';
}

// ! TODO 크롤링에서 대략 초 단위 시간 소요 -> 개선 필요
await axios
.get(link, {
headers: {
'User-Agent': 'Mozilla/5.0 (compatible; OGCrawler/1.0)',
},
})
.then((res) => {
if (res.status !== 200) {
throw new BadRequestException('잘못된 링크입니다.');
} else {
const data = res.data;
if (typeof data === 'string') {
const $ = cheerio.load(data);
title =
$('title').text() !== ''
? $('title').text() || $('meta[name="title"]').attr('content')
: 'Untitled';
$('meta').each((i, el) => {
const meta = $(el);
if (meta.attr('property') === 'og:image') {
coverImg = meta.attr('content');
}
if (meta.attr('property') === 'og:description') {
description = meta.attr('content');
}
if (meta.attr('property') === 'og:site_name') {
siteName = meta.attr('content');
}
});
}
public async fetch(url: string): Promise<OGData> {
try {
const response: AxiosResponse = await axios({
method: 'get',
url,
timeout: this.timeout,
headers: {
'User-Agent': this.userAgent,
},
maxRedirects: this.maxRedirects,
});

return this.parse(response.data);
} catch (error) {
if (error instanceof Error) {
throw new InternalServerErrorException(
`Failed to fetch URL: ${error.message}`,
);
}
})
.catch((e) => {
// Control unreachable link
// if(e.message === 'Request failed with status code 403') {
// 403 에러가 발생하는 링크는 크롤링이 불가능한 링크이다.
// }
for (let idx = 1; idx < 3; idx++) {
if (link.split('/').at(-idx) !== '') {
title = link.split('/').at(-idx);
break;
}
throw new InternalServerErrorException('An unknown error occurred');
}
}

private parse(html: string): OGData {
const $ = cheerio.load(html);
const ogData: Partial<OGData> = {};

// OG 태그 파싱
$('meta[property^="og:"]').each((_, element) => {
const property = $(element).attr('property')?.replace('og:', '');
const content = $(element).attr('content');

if (property && content) {
ogData[property as keyof OGData] = content;
}
title = title ? title : 'Untitled';
});

// 기본 메타 태그 백업 파싱
if (!ogData.title) {
ogData.title =
$('title').text() || $('meta[name="title"]').attr('content') || '';
}

if (!ogData.description) {
ogData.description = $('meta[name="description"]').attr('content') || '';
}

// 이미지 URL 정규화
if (ogData.image && !ogData.image.startsWith('http')) {
const baseUrl = $('base').attr('href');
if (baseUrl) {
try {
ogData.image = new URL(ogData.image, baseUrl).href;
} catch (error) {
console.warn('Failed to normalize image URL:', error);
}
}
}

// 필수 필드가 있는 완성된 객체 반환
return {
title: ogData.title || '',
description: ogData.description || '',
image: ogData.image || '',
url: ogData.url || '',
type: ogData.type || '',
site_name: ogData.site_name || '',
...ogData, // 추가 OG 태그 포함
};
}
}

export const getLinkInfo = async (link: string) => {
const crawler = new OGCrawler();
const ogData = await crawler.fetch(link);

const title = ogData.title;
const description = ogData.description;
const coverImg = ogData.image;
const siteName = ogData.site_name;

return {
title,
description,
Expand Down

0 comments on commit b3dbc4f

Please sign in to comment.