diff --git a/pkg/parser/common.go b/pkg/parser/common.go new file mode 100644 index 0000000..a684356 --- /dev/null +++ b/pkg/parser/common.go @@ -0,0 +1,71 @@ +package parser + +import ( + "bytes" + "fmt" + "time" +) + +const CommonLogFormat = "02/Jan/2006:15:04:05 -0700" + +func clfDateParse(s []byte) time.Time { + return clfDateParseString(string(s)) +} + +func clfDateParseString(s string) time.Time { + t, _ := time.Parse(CommonLogFormat, s) + return t +} + +// Nginx escapes `"`, `\` to `\xXX` +// Apache esacpes `"`, `\` to `\"` `\\` +func findEndingDoubleQuote(data []byte) int { + inEscape := false + for i := 0; i < len(data); i++ { + if inEscape { + inEscape = false + } else { + if data[i] == '\\' { + inEscape = true + } else if data[i] == '"' { + return i + } + } + } + return -1 +} + +func splitFields(line []byte) ([][]byte, error) { + res := make([][]byte, 0, 16) +loop: + for baseIdx := 0; baseIdx < len(line); { + switch line[baseIdx] { + case '"': + quoteIdx := findEndingDoubleQuote(line[baseIdx+1:]) + if quoteIdx == -1 { + return res, fmt.Errorf("unexpected format: unbalanced quotes [ at %d", baseIdx) + } + res = append(res, line[baseIdx+1:baseIdx+quoteIdx+1]) + baseIdx += quoteIdx + 2 + case '[': + closingIdx := bytes.IndexByte(line[baseIdx+1:], ']') + if closingIdx == -1 { + return res, fmt.Errorf("unexpected format: unmatched [ at %d", baseIdx) + } + res = append(res, line[baseIdx+1:baseIdx+closingIdx+1]) + baseIdx += closingIdx + 2 + default: + spaceIdx := bytes.IndexByte(line[baseIdx:], ' ') + if spaceIdx == -1 { + res = append(res, line[baseIdx:]) + break loop + } + res = append(res, line[baseIdx:baseIdx+spaceIdx]) + baseIdx += spaceIdx + } + if baseIdx < len(line) && line[baseIdx] == ' ' { + baseIdx++ + } + } + return res, nil +} diff --git a/pkg/parser/nginx-combined.go b/pkg/parser/nginx-combined.go index c14a12c..2dde257 100644 --- a/pkg/parser/nginx-combined.go +++ b/pkg/parser/nginx-combined.go @@ -6,7 +6,6 @@ import ( "fmt" "regexp" "strconv" - "time" ) func init() { @@ -37,75 +36,24 @@ func init() { }) } -const CommonLogFormat = "02/Jan/2006:15:04:05 -0700" - -func clfDateParse(s []byte) time.Time { - return clfDateParseString(string(s)) -} - -func clfDateParseString(s string) time.Time { - t, _ := time.Parse(CommonLogFormat, s) - return t -} - -// Nginx escapes `"`, `\` to `\xXX` -// Apache esacpes `"`, `\` to `\"` `\\` -func findEndingDoubleQuote(data []byte) int { - inEscape := false - for i := 0; i < len(data); i++ { - if inEscape { - inEscape = false - } else { - if data[i] == '\\' { - inEscape = true - } else if data[i] == '"' { - return i - } - } +func ParseNginxCombined(line []byte) (logItem LogItem, err error) { + fields, err := splitFields(line) + if err != nil { + return logItem, err } - return -1 -} - -func ParseNginxCombined(line []byte) (LogItem, error) { - baseIdx := 0 - // get the first - - delimIndex := bytes.IndexByte(line, '-') - if delimIndex == -1 { - return LogItem{}, errors.New("unexpected format: no - (empty identity)") + if len(fields) != 9 { + return logItem, fmt.Errorf("invalid format: expected 9 fields, got %d", len(fields)) } - clientIP := line[:delimIndex-1] - baseIdx = delimIndex + 1 - // get time within [$time_local] - leftBracketIndex := bytes.IndexByte(line[baseIdx:], '[') - if leftBracketIndex == -1 { - return LogItem{}, errors.New("unexpected format: no [ (datetime)") - } - rightBracketIndex := bytes.IndexByte(line[baseIdx+leftBracketIndex+1:], ']') - if rightBracketIndex == -1 { - return LogItem{}, errors.New("unexpected format: no ] (datetime)") + if string(fields[1]) != "-" { + return logItem, errors.New("unexpected format: no - (empty identity)") } - localTimeByte := line[baseIdx+leftBracketIndex+1 : baseIdx+leftBracketIndex+rightBracketIndex+1] - // localTime, err := time.Parse("02/Jan/2006:15:04:05 -0700", string(localTimeByte)) - // if err != nil { - // return LogItem{}, err - // } - localTime := clfDateParse(localTimeByte) - baseIdx += leftBracketIndex + rightBracketIndex + 2 + logItem.Client = string(fields[0]) + logItem.Time = clfDateParse(fields[3]) - // get URL within first "$request" - leftQuoteIndex := bytes.IndexByte(line[baseIdx:], '"') - if leftQuoteIndex == -1 { - return LogItem{}, errors.New("unexpected format: no \" (request)") - } - rightQuoteIndex := findEndingDoubleQuote(line[baseIdx+leftQuoteIndex+1:]) - if rightQuoteIndex == -1 { - return LogItem{}, errors.New("unexpected format: no \" after first \" (request)") - } - - url := line[baseIdx+leftQuoteIndex+1 : baseIdx+leftQuoteIndex+rightQuoteIndex+1] - baseIdx += leftQuoteIndex + rightQuoteIndex + 2 + requestLine := fields[4] + url := requestLine // strip HTTP method in url spaceIndex := bytes.IndexByte(url, ' ') if spaceIndex == -1 { @@ -121,51 +69,16 @@ func ParseNginxCombined(line []byte) (LogItem, error) { } else { url = url[:spaceIndex] } + logItem.URL = string(url) - // get size ($body_bytes_sent) - baseIdx += 1 - leftSpaceIndex := bytes.IndexByte(line[baseIdx:], ' ') - if leftSpaceIndex == -1 { - return LogItem{}, errors.New("unexpected format: no space after $request (code)") - } - rightSpaceIndex := bytes.IndexByte(line[baseIdx+leftSpaceIndex+1:], ' ') - if rightSpaceIndex == -1 { - return LogItem{}, errors.New("unexpected format: no space after $body_bytes_sent (size)") - } - sizeBytes := line[baseIdx+leftSpaceIndex+1 : baseIdx+leftSpaceIndex+rightSpaceIndex+1] - size, err := strconv.ParseUint(string(sizeBytes), 10, 64) + sizeBytes := fields[6] + logItem.Size, err = strconv.ParseUint(string(sizeBytes), 10, 64) if err != nil { - return LogItem{}, err + return logItem, err } - baseIdx += leftSpaceIndex + rightSpaceIndex + 2 - // skip referer - leftQuoteIndex = bytes.IndexByte(line[baseIdx:], '"') - if leftQuoteIndex == -1 { - return LogItem{}, errors.New("unexpected format: no \" (referer)") - } - rightQuoteIndex = findEndingDoubleQuote(line[baseIdx+leftQuoteIndex+1:]) - if rightQuoteIndex == -1 { - return LogItem{}, errors.New("unexpected format: no \" after first \" (referer)") - } - baseIdx += 1 + leftQuoteIndex + rightQuoteIndex + 2 - // get UA - leftQuoteIndex = bytes.IndexByte(line[baseIdx:], '"') - if leftQuoteIndex == -1 { - return LogItem{}, errors.New("unexpected format: no \" (user-agent)") - } - rightQuoteIndex = findEndingDoubleQuote(line[baseIdx+leftQuoteIndex+1:]) - if rightQuoteIndex == -1 { - return LogItem{}, errors.New("unexpected format: no \" after first \" (user-agent)") - } - userAgent := line[baseIdx+leftQuoteIndex+1 : baseIdx+leftQuoteIndex+rightQuoteIndex+1] - return LogItem{ - Size: size, - Client: string(clientIP), - Time: localTime, - URL: string(url), - Useragent: string(userAgent), - }, nil + logItem.Useragent = string(fields[8]) + return } var nginxCombinedRe = regexp.MustCompile( diff --git a/pkg/parser/tencent-cdn.go b/pkg/parser/tencent-cdn.go index fbcb3e3..d11711b 100644 --- a/pkg/parser/tencent-cdn.go +++ b/pkg/parser/tencent-cdn.go @@ -1,8 +1,6 @@ package parser import ( - "bytes" - "errors" "fmt" "strconv" "time" @@ -34,32 +32,6 @@ func compactDateTimeParseString(s string) time.Time { return t } -func splitFields(line []byte) ([][]byte, error) { - res := make([][]byte, 0, 16) - for baseIdx := 0; baseIdx < len(line); { - if line[baseIdx] == '"' { - quoteIdx := findEndingDoubleQuote(line[baseIdx+1:]) - if quoteIdx == -1 { - return res, errors.New("unexpected format: unbalanced quotes") - } - res = append(res, line[baseIdx+1:baseIdx+quoteIdx+1]) - baseIdx += quoteIdx + 2 - if line[baseIdx] == ' ' { - baseIdx++ - } - } else { - spaceIdx := bytes.IndexByte(line[baseIdx:], ' ') - if spaceIdx == -1 { - res = append(res, line[baseIdx:]) - break - } - res = append(res, line[baseIdx:baseIdx+spaceIdx]) - baseIdx += spaceIdx + 1 - } - } - return res, nil -} - func ParseTencentCDN(line []byte) (logItem LogItem, err error) { fields, err := splitFields(line) if err != nil {