Skip to content

Commit

Permalink
优化采集防范
Browse files Browse the repository at this point in the history
  • Loading branch information
fesiong committed Dec 24, 2020
1 parent bd64954 commit 811050c
Show file tree
Hide file tree
Showing 9 changed files with 175 additions and 38 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
/vendor
/config.json
/collector*
*packr.go
/test
83 changes: 83 additions & 0 deletions app/handler/article.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,93 @@ func ArticleSourceSaveApi(ctx iris.Context) {
})
return
}
//添加完,马上抓取
core.GetArticleLinks(source)

ctx.JSON(iris.Map{
"code": config.StatusOK,
"msg": "添加/修改成功",
"data": source,
})
}

// ArticlePublishApi handles a request to publish a single stored article.
//
// It reads the request form into a request.Article, loads the article with
// provider.GetArticleById(req.ID) and passes it to core.AutoPublish. If the
// form cannot be read or the lookup fails, it replies with a JSON body
// carrying config.StatusFailed and the error text, and stops.
func ArticlePublishApi(ctx iris.Context) {
	var req request.Article
	if err := ctx.ReadForm(&req); err != nil {
		ctx.JSON(iris.Map{
			"code": config.StatusFailed,
			"msg":  err.Error(),
		})
		return
	}

	article, err := provider.GetArticleById(req.ID)
	if err != nil {
		ctx.JSON(iris.Map{
			"code": config.StatusFailed,
			"msg":  err.Error(),
		})
		return
	}

	// AutoPublish is called synchronously here, so the response below is
	// only written after it returns.
	core.AutoPublish(article)

	// The message used to read "删除成功" ("deleted successfully"), copied
	// from the delete handler. AutoPublish reports no result to this handler,
	// so the reply only states that the publish task has been executed.
	ctx.JSON(iris.Map{
		"code": config.StatusOK,
		"msg":  "发布任务已执行",
	})
}

// ArticleCatchApi starts a background fetch of one article's detail.
//
// The request form is read into a request.Article and the article is looked
// up by its ID. On either failure the handler answers with
// config.StatusFailed plus the error text. Otherwise it launches
// core.GetArticleDetail in a new goroutine and answers immediately.
func ArticleCatchApi(ctx iris.Context) {
	var form request.Article
	err := ctx.ReadForm(&form)
	if err != nil {
		ctx.JSON(iris.Map{"code": config.StatusFailed, "msg": err.Error()})
		return
	}

	target, err := provider.GetArticleById(form.ID)
	if err != nil {
		ctx.JSON(iris.Map{"code": config.StatusFailed, "msg": err.Error()})
		return
	}

	// Fire and forget: the response does not wait for the fetch to finish.
	go core.GetArticleDetail(target)

	ctx.JSON(iris.Map{"code": config.StatusOK, "msg": "抓取任务已执行"})
}

// ArticleSourceCatchApi starts a background link collection for one source.
//
// The request form is read into a request.ArticleSource and the source is
// looked up by its ID. On either failure the handler answers with
// config.StatusFailed plus the error text. Otherwise it launches
// core.GetArticleLinks in a new goroutine and answers immediately.
func ArticleSourceCatchApi(ctx iris.Context) {
	var form request.ArticleSource
	err := ctx.ReadForm(&form)
	if err != nil {
		ctx.JSON(iris.Map{"code": config.StatusFailed, "msg": err.Error()})
		return
	}

	target, err := provider.GetArticleSourceById(form.ID)
	if err != nil {
		ctx.JSON(iris.Map{"code": config.StatusFailed, "msg": err.Error()})
		return
	}

	// Fire and forget: the response does not wait for the collection to finish.
	go core.GetArticleLinks(target)

	ctx.JSON(iris.Map{"code": config.StatusOK, "msg": "抓取任务执行"})
}
26 changes: 16 additions & 10 deletions app/route/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,22 @@ func Register(app *iris.Application) {
app.Post("/setting", handler.InspectJson, handler.DefaultSettingForm)
app.Post("/publish", handler.InspectJson, handler.PublishSettingForm)

app.Get("/api/index/echarts", handler.InspectJson, handler.IndexEchartsApi)

app.Get("/api/article/list", handler.InspectJson, handler.ArticleListApi)
app.Post("/api/article/delete", handler.InspectJson, handler.ArticleDeleteApi)

app.Get("/api/article/source/list", handler.InspectJson, handler.ArticleSourceListApi)
app.Post("/api/article/source/delete", handler.InspectJson, handler.ArticleSourceDeleteApi)
app.Post("/api/article/source/save", handler.InspectJson, handler.ArticleSourceSaveApi)
app.Get("/api/setting", handler.InspectJson, handler.DefaultSettingApi)
app.Get("/api/publish", handler.InspectJson, handler.PublishSettingApi)
api := app.Party("/api", handler.InspectJson)
{
api.Get("/index/echarts", handler.IndexEchartsApi)

api.Get("/article/list", handler.ArticleListApi)
api.Post("/article/delete", handler.ArticleDeleteApi)
api.Post("/article/publish", handler.ArticlePublishApi)
api.Post("/article/catch", handler.ArticleCatchApi)

api.Get("/article/source/list", handler.ArticleSourceListApi)
api.Post("/article/source/delete", handler.ArticleSourceDeleteApi)
api.Post("/article/source/save", handler.ArticleSourceSaveApi)
api.Post("/article/source/catch", handler.ArticleSourceCatchApi)
api.Get("/setting", handler.DefaultSettingApi)
api.Get("/publish", handler.PublishSettingApi)
}
}

func Cors(ctx iris.Context) {
Expand Down
4 changes: 4 additions & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"os"
"path/filepath"
"regexp"
"strings"
"unicode/utf8"
)

Expand All @@ -25,6 +26,9 @@ func InitJSON() {
sep := string(os.PathSeparator)
root := filepath.Dir(os.Args[0])
ExecPath, _ = filepath.Abs(root)
if strings.Contains(ExecPath, "/T/") {
ExecPath, _ = os.Getwd()
}
length := utf8.RuneCountInString(ExecPath)
lastChar := ExecPath[length-1:]
if lastChar != sep {
Expand Down
10 changes: 6 additions & 4 deletions core/article.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func (article *Article) Save(db *gorm.DB) error {
article.CreatedTime = int(time.Now().Unix())
}

if err := db.Save(&article).Error; err != nil {
if err := db.Save(article).Error; err != nil {
return err
}
articleData := ArticleData{
Expand All @@ -58,16 +58,18 @@ func (article *Article) Save(db *gorm.DB) error {

func (article *Article) Delete() error {
db := services.DB
if err := db.Delete(&article).Error; err != nil {
if err := db.Delete(article).Error; err != nil {
return err
}

db.Where("id = ?", article.Id).Delete(ArticleData{})

return nil
}

func (source *ArticleSource) Save() error {
db := services.DB
if err := db.Save(&source).Error; err != nil {
if err := db.Save(source).Error; err != nil {
return err
}

Expand All @@ -76,7 +78,7 @@ func (source *ArticleSource) Save() error {

func (source *ArticleSource) Delete() error {
db := services.DB
if err := db.Delete(&source).Error; err != nil {
if err := db.Delete(source).Error; err != nil {
return err
}

Expand Down
58 changes: 40 additions & 18 deletions core/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"collector/config"
"collector/library"
"collector/services"
"crypto/tls"
"fmt"
"github.com/Chain-Zhang/pinyin"
"github.com/PuerkitoBio/goquery"
Expand Down Expand Up @@ -86,7 +87,7 @@ func CollectListTask() {
for _, v := range articleSources {
//ch <- fmt.Sprintf("%d", i)
//waitGroup.Add(1)
GetArticleLinks(v)
getArticleLinks(v)
}

//waitGroup.Wait()
Expand All @@ -105,13 +106,21 @@ func CollectDetailTask() {
for _, vv := range articleList {
ch <- vv.OriginUrl
waitGroup.Add(1)
go GetArticleDetail(vv)
go getArticleDetail(vv)
}

waitGroup.Wait()
}

func GetArticleLinks(v ArticleSource) {
// getArticleLinks is the by-value wrapper around GetArticleLinks that
// CollectListTask calls for each source. It passes GetArticleLinks a pointer
// to its own copy of the source.
func getArticleLinks(v ArticleSource) {
	// Wait-group / channel release is currently disabled for list collection:
	//   defer func() {
	//       waitGroup.Done()
	//       <-ch
	//   }()
	source := &v
	GetArticleLinks(source)
}

func GetArticleLinks(v *ArticleSource) {
//defer func() {
// waitGroup.Done()
// <-ch
Expand All @@ -128,21 +137,25 @@ func GetArticleLinks(v ArticleSource) {
db.Model(Article{}).Where(Article{OriginUrl: article.OriginUrl}).FirstOrCreate(&article)
}
} else {
db.Model(&v).Update("error_times", v.ErrorTimes+1)
db.Model(v).Update("error_times", v.ErrorTimes+1)
}
}

func GetArticleDetail(v Article) {
// getArticleDetail is the goroutine entry point that CollectDetailTask uses
// for each article. It runs GetArticleDetail on its own copy of the article.
// When it finishes it marks the wait group done and then receives one value
// from ch, undoing the send and Add(1) that CollectDetailTask performed
// before starting it.
func getArticleDetail(v Article) {
	release := func() {
		waitGroup.Done()
		<-ch
	}
	defer release()

	item := &v
	GetArticleDetail(item)
}

func GetArticleDetail(v *Article) {
db := services.DB
//标记当前为执行中
db.Model(Article{}).Where("`id` = ?", v.Id).Update("status", 2)

_ = CollectDetail(&v)
_ = CollectDetail(v)

//更新到数据库中
status := int(1)
Expand Down Expand Up @@ -188,7 +201,7 @@ func GetArticleDetail(v Article) {
fmt.Println(status, v.Title, v.OriginUrl)
article.Save(db)

AutoPublish(&article)
AutoPublish(article)
}

func AutoPublish(article *Article) {
Expand Down Expand Up @@ -622,7 +635,7 @@ func (article *Article) ParseContent(doc *goquery.Document, body string) {
contentLength := 0

//对一些固定的内容,直接获取值
contentItems := doc.Find("UCAPCONTENT,#mainText,.article-content,#article-content,#articleContnet,.entry-content,.the_body,.rich_media_content,#js_content,.word_content,.pages_content,.wendang_content,#content")
contentItems := doc.Find("UCAPCONTENT,#mainText,.article-content,#article-content,#articleContnet,.entry-content,.the_body,.rich_media_content,#js_content,.word_content,.pages_content,.wendang_content,#content,.RichText,.markdown-section")
if contentItems.Length() > 0 {
for i := range contentItems.Nodes {
contentItem := contentItems.Eq(i)
Expand All @@ -641,7 +654,16 @@ func (article *Article) ParseContent(doc *goquery.Document, body string) {
if curLen < config.CollectorConfig.ContentMinLength {
contentText = ""
}
aCount := contentItem.Find("a").Length()
aCount := 0
aLinks := contentItem.Find("a")
if aLinks.Length() > 0 {
for i := range aLinks.Nodes {
href, exist := aLinks.Eq(i).Attr("href")
if exist && href != "" && !strings.HasPrefix(href, "#") {
aCount++
}
}
}
if aCount > 5 {
//太多连接了,直接放弃该内容
contentText = ""
Expand All @@ -652,16 +674,16 @@ func (article *Article) ParseContent(doc *goquery.Document, body string) {
if divs.Length() > 0 {
for i := range divs.Nodes {
div := divs.Eq(i)
if (div.Find("div").Length() == 0 || utf8.RuneCountInString(div.Find("div").Text()) < 100) && utf8.RuneCountInString(div.Text()) >= config.CollectorConfig.ContentMinLength {
if (div.Find("div").Length() == 0 || utf8.RuneCountInString(div.Find("div").Text()) < 100) && div.ChildrenFiltered("p").Length() > 0 && utf8.RuneCountInString(div.Text()) >= config.CollectorConfig.ContentMinLength {
contentItem = div
break
}
}
}
//排除一些不对的标签
otherLength := contentItem.Find("input,textarea,form,button,footer,.footer").Length()
if otherLength > 0 {
contentText = ""
otherItems := contentItem.Find("input,textarea,form,button,footer,.footer")
if otherItems.Length() > 0 {
otherItems.Remove()
}
contentItem.Find("h1").Remove()
//根据规则过滤
Expand All @@ -686,7 +708,7 @@ func (article *Article) ParseContent(doc *goquery.Document, body string) {
if contentText == "" {
content = ""
//通用的获取方法
divs := doc.Find("div")
divs := doc.Find("div,article")
for i := range divs.Nodes {
item := divs.Eq(i)
pCount := item.ChildrenFiltered("p").Length()
Expand Down Expand Up @@ -798,7 +820,7 @@ func (article *Article) ReplaceHref(src string) string {
* 请求域名返回数据
*/
func Request(urlPath string) (*RequestData, error) {
resp, body, errs := gorequest.New().Timeout(90 * time.Second).Get(urlPath).End()
resp, body, errs := gorequest.New().TLSClientConfig(&tls.Config{ InsecureSkipVerify: true}).Timeout(90 * time.Second).AppendHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36").Get(urlPath).End()
if len(errs) > 0 {
//如果是https,则尝试退回http请求
if strings.HasPrefix(urlPath, "https") {
Expand Down Expand Up @@ -908,11 +930,11 @@ func HasContain(need string, needArray []string) bool {
}

func GetKeywords(content string, num int) []string {
lenth := 2
keywords := keyword.Extractor.Extract(content, 1000)
var words []string
length := 2
keywords := keyword.Extractor.Extract(content, 1000)
for _, v := range keywords {
if utf8.RuneCountInString(v) >= lenth {
if utf8.RuneCountInString(v) >= length {
words = append(words, v)
}
}
Expand Down
Loading

0 comments on commit 811050c

Please sign in to comment.