diff --git a/.gitignore b/.gitignore index ef3cbc9..96121d8 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ /vendor /config.json /collector* -*packr.go \ No newline at end of file +/test \ No newline at end of file diff --git a/app/handler/article.go b/app/handler/article.go index 6fced30..fa5fd8b 100644 --- a/app/handler/article.go +++ b/app/handler/article.go @@ -176,6 +176,8 @@ func ArticleSourceSaveApi(ctx iris.Context) { }) return } + //添加完,马上抓取 + core.GetArticleLinks(source) ctx.JSON(iris.Map{ "code": config.StatusOK, @@ -183,3 +185,84 @@ func ArticleSourceSaveApi(ctx iris.Context) { "data": source, }) } + +func ArticlePublishApi(ctx iris.Context) { + var req request.Article + if err := ctx.ReadForm(&req); err != nil { + ctx.JSON(iris.Map{ + "code": config.StatusFailed, + "msg": err.Error(), + }) + return + } + + article, err := provider.GetArticleById(req.ID) + if err != nil { + ctx.JSON(iris.Map{ + "code": config.StatusFailed, + "msg": err.Error(), + }) + return + } + + core.AutoPublish(article) + + ctx.JSON(iris.Map{ + "code": config.StatusOK, + "msg": "删除成功", + }) +} + +func ArticleCatchApi(ctx iris.Context) { + var req request.Article + if err := ctx.ReadForm(&req); err != nil { + ctx.JSON(iris.Map{ + "code": config.StatusFailed, + "msg": err.Error(), + }) + return + } + + article, err := provider.GetArticleById(req.ID) + if err != nil { + ctx.JSON(iris.Map{ + "code": config.StatusFailed, + "msg": err.Error(), + }) + return + } + + go core.GetArticleDetail(article) + + ctx.JSON(iris.Map{ + "code": config.StatusOK, + "msg": "抓取任务已执行", + }) +} + +func ArticleSourceCatchApi(ctx iris.Context) { + var req request.ArticleSource + if err := ctx.ReadForm(&req); err != nil { + ctx.JSON(iris.Map{ + "code": config.StatusFailed, + "msg": err.Error(), + }) + return + } + + source, err := provider.GetArticleSourceById(req.ID) + if err != nil { + ctx.JSON(iris.Map{ + "code": config.StatusFailed, + "msg": err.Error(), + }) + return + } + + go core.GetArticleLinks(source) + + ctx.JSON(iris.Map{ + "code": config.StatusOK, + "msg": "抓取任务执行", + }) +} \ No newline at end of file diff --git a/app/route/base.go b/app/route/base.go index d106f88..e799d12 100644 --- a/app/route/base.go +++ b/app/route/base.go @@ -28,16 +28,22 @@ func Register(app *iris.Application) { app.Post("/setting", handler.InspectJson, handler.DefaultSettingForm) app.Post("/publish", handler.InspectJson, handler.PublishSettingForm) - app.Get("/api/index/echarts", handler.InspectJson, handler.IndexEchartsApi) - - app.Get("/api/article/list", handler.InspectJson, handler.ArticleListApi) - app.Post("/api/article/delete", handler.InspectJson, handler.ArticleDeleteApi) - - app.Get("/api/article/source/list", handler.InspectJson, handler.ArticleSourceListApi) - app.Post("/api/article/source/delete", handler.InspectJson, handler.ArticleSourceDeleteApi) - app.Post("/api/article/source/save", handler.InspectJson, handler.ArticleSourceSaveApi) - app.Get("/api/setting", handler.InspectJson, handler.DefaultSettingApi) - app.Get("/api/publish", handler.InspectJson, handler.PublishSettingApi) + api := app.Party("/api", handler.InspectJson) + { + api.Get("/index/echarts", handler.IndexEchartsApi) + + api.Get("/article/list", handler.ArticleListApi) + api.Post("/article/delete", handler.ArticleDeleteApi) + api.Post("/article/publish", handler.ArticlePublishApi) + api.Post("/article/catch", handler.ArticleCatchApi) + + api.Get("/article/source/list", handler.ArticleSourceListApi) + api.Post("/article/source/delete", handler.ArticleSourceDeleteApi) + api.Post("/article/source/save", handler.ArticleSourceSaveApi) + api.Post("/article/source/catch", handler.ArticleSourceCatchApi) + api.Get("/setting", handler.DefaultSettingApi) + api.Get("/publish", handler.PublishSettingApi) + } } func Cors(ctx iris.Context) { diff --git a/config/config.go b/config/config.go index 7507bbf..f7fc444 100644 --- a/config/config.go +++ b/config/config.go @@ -9,6 +9,7 @@ import ( "os" "path/filepath" "regexp" + "strings" "unicode/utf8" ) @@ -25,6 +26,9 @@ func InitJSON() { sep := string(os.PathSeparator) root := filepath.Dir(os.Args[0]) ExecPath, _ = filepath.Abs(root) + if strings.Contains(ExecPath, "/T/") { + ExecPath, _ = os.Getwd() + } length := utf8.RuneCountInString(ExecPath) lastChar := ExecPath[length-1:] if lastChar != sep { diff --git a/core/article.go b/core/article.go index da3cb8e..c0571de 100644 --- a/core/article.go +++ b/core/article.go @@ -44,7 +44,7 @@ func (article *Article) Save(db *gorm.DB) error { article.CreatedTime = int(time.Now().Unix()) } - if err := db.Save(&article).Error; err != nil { + if err := db.Save(article).Error; err != nil { return err } articleData := ArticleData{ @@ -58,16 +58,18 @@ func (article *Article) Save(db *gorm.DB) error { func (article *Article) Delete() error { db := services.DB - if err := db.Delete(&article).Error; err != nil { + if err := db.Delete(article).Error; err != nil { return err } + db.Where("id = ?", article.Id).Delete(ArticleData{}) + return nil } func (source *ArticleSource) Save() error { db := services.DB - if err := db.Save(&source).Error; err != nil { + if err := db.Save(source).Error; err != nil { return err } @@ -76,7 +78,7 @@ func (source *ArticleSource) Save() error { func (source *ArticleSource) Delete() error { db := services.DB - if err := db.Delete(&source).Error; err != nil { + if err := db.Delete(source).Error; err != nil { return err } diff --git a/core/collector.go b/core/collector.go index a19ad2a..057c99b 100644 --- a/core/collector.go +++ b/core/collector.go @@ -4,6 +4,7 @@ import ( "collector/config" "collector/library" "collector/services" + "crypto/tls" "fmt" "github.com/Chain-Zhang/pinyin" "github.com/PuerkitoBio/goquery" @@ -86,7 +87,7 @@ func CollectListTask() { for _, v := range articleSources { //ch <- fmt.Sprintf("%d", i) //waitGroup.Add(1) - GetArticleLinks(v) + getArticleLinks(v) } //waitGroup.Wait() @@ -105,13 +106,21 @@ func CollectDetailTask() { for _, vv := range articleList { ch <- vv.OriginUrl waitGroup.Add(1) - go GetArticleDetail(vv) + go getArticleDetail(vv) } waitGroup.Wait() } -func GetArticleLinks(v ArticleSource) { +func getArticleLinks(v ArticleSource) { + //defer func() { + // waitGroup.Done() + // <-ch + //}() + GetArticleLinks(&v) +} + +func GetArticleLinks(v *ArticleSource) { //defer func() { // waitGroup.Done() // <-ch @@ -128,21 +137,25 @@ func GetArticleLinks(v ArticleSource) { db.Model(Article{}).Where(Article{OriginUrl: article.OriginUrl}).FirstOrCreate(&article) } } else { - db.Model(&v).Update("error_times", v.ErrorTimes+1) + db.Model(v).Update("error_times", v.ErrorTimes+1) } } -func GetArticleDetail(v Article) { +func getArticleDetail(v Article) { defer func() { waitGroup.Done() <-ch }() + GetArticleDetail(&v) +} + +func GetArticleDetail(v *Article) { db := services.DB //标记当前为执行中 db.Model(Article{}).Where("`id` = ?", v.Id).Update("status", 2) - _ = CollectDetail(&v) + _ = CollectDetail(v) //更新到数据库中 status := int(1) @@ -188,7 +201,7 @@ func GetArticleDetail(v Article) { fmt.Println(status, v.Title, v.OriginUrl) article.Save(db) - AutoPublish(&article) + AutoPublish(article) } func AutoPublish(article *Article) { @@ -622,7 +635,7 @@ func (article *Article) ParseContent(doc *goquery.Document, body string) { contentLength := 0 //对一些固定的内容,直接获取值 - contentItems := doc.Find("UCAPCONTENT,#mainText,.article-content,#article-content,#articleContnet,.entry-content,.the_body,.rich_media_content,#js_content,.word_content,.pages_content,.wendang_content,#content") + contentItems := doc.Find("UCAPCONTENT,#mainText,.article-content,#article-content,#articleContnet,.entry-content,.the_body,.rich_media_content,#js_content,.word_content,.pages_content,.wendang_content,#content,.RichText,.markdown-section") if contentItems.Length() > 0 { for i := range contentItems.Nodes { contentItem := contentItems.Eq(i) @@ -641,7 +654,16 @@ func (article *Article) ParseContent(doc *goquery.Document, body string) { if curLen < config.CollectorConfig.ContentMinLength { contentText = "" } - aCount := contentItem.Find("a").Length() + aCount := 0 + aLinks := contentItem.Find("a") + if aLinks.Length() > 0 { + for i := range aLinks.Nodes { + href, exist := aLinks.Eq(i).Attr("href") + if exist && href != "" && !strings.HasPrefix(href, "#") { + aCount++ + } + } + } if aCount > 5 { //太多连接了,直接放弃该内容 contentText = "" @@ -652,16 +674,16 @@ func (article *Article) ParseContent(doc *goquery.Document, body string) { if divs.Length() > 0 { for i := range divs.Nodes { div := divs.Eq(i) - if (div.Find("div").Length() == 0 || utf8.RuneCountInString(div.Find("div").Text()) < 100) && utf8.RuneCountInString(div.Text()) >= config.CollectorConfig.ContentMinLength { + if (div.Find("div").Length() == 0 || utf8.RuneCountInString(div.Find("div").Text()) < 100) && div.ChildrenFiltered("p").Length() > 0 && utf8.RuneCountInString(div.Text()) >= config.CollectorConfig.ContentMinLength { contentItem = div break } } } //排除一些不对的标签 - otherLength := contentItem.Find("input,textarea,form,button,footer,.footer").Length() - if otherLength > 0 { - contentText = "" + otherItems := contentItem.Find("input,textarea,form,button,footer,.footer") + if otherItems.Length() > 0 { + otherItems.Remove() } contentItem.Find("h1").Remove() //根据规则过滤 @@ -686,7 +708,7 @@ func (article *Article) ParseContent(doc *goquery.Document, body string) { if contentText == "" { content = "" //通用的获取方法 - divs := doc.Find("div") + divs := doc.Find("div,article") for i := range divs.Nodes { item := divs.Eq(i) pCount := item.ChildrenFiltered("p").Length() @@ -798,7 +820,7 @@ func (article *Article) ReplaceHref(src string) string { * 请求域名返回数据 */ func Request(urlPath string) (*RequestData, error) { - resp, body, errs := gorequest.New().Timeout(90 * time.Second).Get(urlPath).End() + resp, body, errs := gorequest.New().TLSClientConfig(&tls.Config{ InsecureSkipVerify: true}).Timeout(90 * time.Second).AppendHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36").Get(urlPath).End() if len(errs) > 0 { //如果是https,则尝试退回http请求 if strings.HasPrefix(urlPath, "https") { @@ -908,11 +930,11 @@ func HasContain(need string, needArray []string) bool { } func GetKeywords(content string, num int) []string { - lenth := 2 - keywords := keyword.Extractor.Extract(content, 1000) var words []string + length := 2 + keywords := keyword.Extractor.Extract(content, 1000) for _, v := range keywords { - if utf8.RuneCountInString(v) >= lenth { + if utf8.RuneCountInString(v) >= length { words = append(words, v) } } diff --git a/template/article/list.html b/template/article/list.html index ec4297e..e1ff37d 100644 --- a/template/article/list.html +++ b/template/article/list.html @@ -12,7 +12,9 @@