Skip to content

Commit

Permalink
changed project structure
Browse files Browse the repository at this point in the history
  • Loading branch information
demyanovs committed Jul 9, 2024
1 parent 7eb0fbf commit e84356c
Show file tree
Hide file tree
Showing 10 changed files with 68 additions and 60 deletions.
37 changes: 19 additions & 18 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ import (
"net/url"
"time"

"github.com/demyanovs/robotstxt"
_ "golang.org/x/lint"

"github.com/demyanovs/robotstxt"
"github.com/demyanovs/urlcrawler/utils"
"github.com/demyanovs/urlcrawler/queue"
"github.com/demyanovs/urlcrawler/report"
)

const (
Expand All @@ -28,13 +29,13 @@ var supportedOutputs = []string{outputCSV, outputJSON}
func main() {
startURL := flag.String("u", "", "Start url (required)")
output := flag.String("output", outputCSV, "Output format (csv, json)")
outputFile := flag.String("output-file", "", "File path to save report")
outputFile := flag.String("output-file", "", "File path to save report")
delay := flag.Int("delay", 1000, "Delay between requests in milliseconds")
depth := flag.Int("depth", 0, "Depth of the crawl (0 - infinite)")
limitURLs := flag.Int("limit", 0, "Limit of URLs to crawl (0 - unlimited)")
reqTimeout := flag.Int("timeout", 5000, "Request timeout in milliseconds")
bulkSize := flag.Int("bulk-size", 30, "Bulk size for saving to the file")
queueLen := flag.Int("queue-len", 50, "Queue length")
queueLen := flag.Int("q-len", 50, "Queue length")
quietMode := flag.Bool("q", false, "Quiet mode (no logs)")
ignoreRobotsTXT := flag.Bool("ignore-robots", false, "Ignore crawl-delay and disallowed URLs from robots.txt")

Expand All @@ -50,10 +51,10 @@ func main() {

logger := log.New(log.Writer(), "", log.Ldate|log.Ltime)

report, reportFile := getReport(*output, *outputFile)
r, reportFile := getReport(*output, *outputFile)

queue, err := utils.NewQueue(
utils.ConfigType{
q, err := queue.New(
queue.ConfigType{
QueueLen: *queueLen,
LimitURLs: *limitURLs,
ReqTimeout: time.Duration(*reqTimeout) * time.Millisecond,
Expand All @@ -63,7 +64,7 @@ func main() {
Depth: *depth,
},
*startURL,
report,
r,
logger,
nil,
)
Expand All @@ -81,15 +82,15 @@ func main() {
log.Fatal(err)
}

queue.RobotsData = robots
q.RobotsData = robots

crawlDelay, err := robots.GetCrawlDelay("*")
if err != nil {
log.Fatal(err)
}

if crawlDelay != nil {
queue.Config.Delay = time.Duration(*crawlDelay) * time.Second
q.Config.Delay = time.Duration(*crawlDelay) * time.Second
if *quietMode == false {
logger.Printf("found crawl-delay in robots.txt: %ds. Ignoring delay from the config\n", *crawlDelay)
}
Expand All @@ -101,10 +102,10 @@ func main() {
}

if *quietMode == false {
printConfig(queue, *output, reportFile, *ignoreRobotsTXT, logger)
printConfig(q, *output, reportFile, *ignoreRobotsTXT, logger)
}

queue.Start()
q.Start()
}

func getRobotsTXT(startURL string) (*robotstxt.RobotsData, error) {
Expand All @@ -128,24 +129,24 @@ func getRobotsTXT(startURL string) (*robotstxt.RobotsData, error) {
return robots, nil
}

func getReport(output string, outputFile string) (utils.Reporter, string) {
var report utils.Reporter
func getReport(output string, outputFile string) (queue.Reporter, string) {
var r queue.Reporter
if output == outputJSON {
if outputFile == "" {
outputFile = fmt.Sprintf("%s.%s", fileNameDefault, outputJSON)
}
report = utils.NewJSONReport(outputFile)
r = report.NewJSONReport(outputFile)
} else {
if outputFile == "" {
outputFile = fmt.Sprintf("%s.%s", fileNameDefault, outputCSV)
}
report = utils.NewCSVReport(outputFile)
r = report.NewCSVReport(outputFile)
}

return report, outputFile
return r, outputFile
}

func printConfig(queue *utils.Queue, output string, outputFile string, ignoreRobotsTXT bool, logger *log.Logger) {
func printConfig(queue *queue.Queue, output string, outputFile string, ignoreRobotsTXT bool, logger *log.Logger) {
logger.Printf(
"Starting crawling, "+
"delay: %dms, "+
Expand Down
6 changes: 3 additions & 3 deletions utils/parser.go → parser/parser.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package utils
package parser

import (
"fmt"
Expand Down Expand Up @@ -32,8 +32,8 @@ type Parser struct {
Client http.Client
}

// NewParser creates a new Parser.
func NewParser() Parser {
// New creates a new Parser.
func New() Parser {
return Parser{
//Client: client,
}
Expand Down
6 changes: 3 additions & 3 deletions utils/parser_test.go → parser/parser_test.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package utils
package parser

import (
"io"
Expand Down Expand Up @@ -26,7 +26,7 @@ func TestParseURL_WrongStatusCodeError(t *testing.T) {
},
}

parser := NewParser()
parser := New()
_, _, err := parser.ParseResponse(&resp)

require.Error(t, err)
Expand All @@ -45,7 +45,7 @@ func TestParseURL_Success(t *testing.T) {
},
}

parser := NewParser()
parser := New()
pageData, linksOnPage, err := parser.ParseResponse(&resp)

require.NoError(t, err)
Expand Down
34 changes: 18 additions & 16 deletions utils/queue.go → queue/queue.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package utils
package queue

import (
"context"
"fmt"
"github.com/demyanovs/urlcrawler/parser"
"github.com/demyanovs/urlcrawler/store"
"log"
"net/http"
"net/url"
Expand All @@ -16,7 +18,7 @@ type Queue struct {
startURL *url.URL
report Reporter
RobotsData RobotsData
parser Parser
parser parser.Parser
logger Logger
startedAt time.Time
sURLsDone URLStore
Expand Down Expand Up @@ -56,16 +58,16 @@ type RobotsData interface {

// Reporter represents a reporter.
type Reporter interface {
SaveBulk(records []PageData) error
SaveBulk(records []parser.PageData) error
}

// Logger represents a logger.
type Logger interface {
Println(v ...any)
}

// NewQueue creates a new queue.
func NewQueue(
// New creates a new queue.
func New(
config ConfigType,
startURL string,
report Reporter,
Expand All @@ -77,20 +79,20 @@ func NewQueue(
return nil, err
}

sURLsToDo := NewStore()
sURLsToDo := store.New()
sURLsToDo.Add(startURL, 0)

return &Queue{
Config: config,
startURL: parsedURL,
report: report,
RobotsData: robotsData,
parser: NewParser(),
parser: parser.New(),
logger: logger,
sURLsDone: NewStore(),
sURLsDone: store.New(),
sURLsToDo: sURLsToDo,
sURLsInProgress: NewStore(),
sURLsToSave: NewStore(),
sURLsInProgress: store.New(),
sURLsToSave: store.New(),
}, nil
}

Expand Down Expand Up @@ -157,13 +159,13 @@ func (q *Queue) process(queue chan struct{}, wg *sync.WaitGroup, URL string, dep
ctx, cancel := context.WithTimeout(context.Background(), q.Config.ReqTimeout)
defer cancel()

var pageData PageData
var pageData parser.PageData
var linksOnPage []string

// Start processing
resp, err := q.readURL(ctx, URL)
if err != nil {
pageData = PageData{
pageData = parser.PageData{
URL: URL,
}
fmt.Println(fmt.Errorf("can't send request to url %s. Error: %s", URL, err))
Expand Down Expand Up @@ -197,12 +199,12 @@ func (q *Queue) process(queue chan struct{}, wg *sync.WaitGroup, URL string, dep
}()
}

func (q *Queue) toPagesData(data []any) PagesData {
var pagesData PagesData
func (q *Queue) toPagesData(data []any) parser.PagesData {
var pagesData parser.PagesData
for _, d := range data {
switch d.(type) {
case PageData:
pagesData = append(pagesData, d.(PageData))
case parser.PageData:
pagesData = append(pagesData, d.(parser.PageData))
}
}

Expand Down
5 changes: 3 additions & 2 deletions utils/report_csv.go → report/report_csv.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
package utils
package report

import (
"encoding/csv"
"github.com/demyanovs/urlcrawler/parser"
"log"
"os"
"strconv"
Expand All @@ -24,7 +25,7 @@ func NewCSVReport(filePath string) *CSVReport {
}

// SaveBulk saves multiple records to the file.
func (r *CSVReport) SaveBulk(records []PageData) error {
func (r *CSVReport) SaveBulk(records []parser.PageData) error {
if r.firstInsert == true {
err := r.addHeader()
if err != nil {
Expand Down
5 changes: 3 additions & 2 deletions utils/report_csv_test.go → report/report_csv_test.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
package utils
package report

import (
"encoding/csv"
"github.com/demyanovs/urlcrawler/parser"
"log"
"os"
"testing"

"github.com/stretchr/testify/require"
)

var records = PagesData{
var records = parser.PagesData{
{
URL: "https://en.wikipedia.org/wiki/Yuri_Gagarin",
StatusCode: 200,
Expand Down
7 changes: 4 additions & 3 deletions utils/report_json.go → report/report_json.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
package utils
package report

import (
"encoding/json"
"github.com/demyanovs/urlcrawler/parser"
"os"
)

Expand All @@ -20,7 +21,7 @@ func NewJSONReport(filePath string) *JSONReport {
}

// SaveBulk saves multiple records to the file.
func (r *JSONReport) SaveBulk(records []PageData) error {
func (r *JSONReport) SaveBulk(records []parser.PageData) error {
file, err := os.OpenFile(r.filePath, os.O_CREATE|os.O_RDWR, 0644)
defer file.Close()

Expand All @@ -40,7 +41,7 @@ func (r *JSONReport) SaveBulk(records []PageData) error {

decoder := json.NewDecoder(file)

var data []PageData
var data []parser.PageData
for decoder.More() {
err = decoder.Decode(&data)
if err != nil {
Expand Down
7 changes: 4 additions & 3 deletions utils/report_json_test.go → report/report_json_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
package utils
package report

import (
"encoding/json"
"github.com/demyanovs/urlcrawler/parser"
"io"
"log"
"os"
Expand All @@ -10,7 +11,7 @@ import (
"github.com/stretchr/testify/require"
)

var data = PagesData{
var data = parser.PagesData{
{
URL: "https://en.wikipedia.org/wiki/Yuri_Gagarin",
StatusCode: 200,
Expand Down Expand Up @@ -44,7 +45,7 @@ func TestSaveBulkJSON_WithHeaderSuccess(t *testing.T) {

defer os.Remove(filePath)

var parsedData PagesData
var parsedData parser.PagesData
f, err := os.Open(filePath)
if err != nil {
log.Println(err)
Expand Down
6 changes: 3 additions & 3 deletions utils/store.go → store/store.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package utils
package store

import (
"errors"
Expand All @@ -14,8 +14,8 @@ type Store struct {
m map[string]any
}

// NewStore creates a new store.
func NewStore() *Store {
// New creates a new store.
func New() *Store {
return &Store{
m: make(map[string]any),
}
Expand Down
Loading

0 comments on commit e84356c

Please sign in to comment.