Skip to content

Commit

Permalink
Python log parser - handle limited size log content
Browse files Browse the repository at this point in the history
  • Loading branch information
attiasas committed Nov 20, 2023
1 parent 971cf8d commit 31d3204
Show file tree
Hide file tree
Showing 2 changed files with 222 additions and 48 deletions.
142 changes: 94 additions & 48 deletions utils/pythonutils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ const (
Pip PythonTool = "pip"
Pipenv PythonTool = "pipenv"
Poetry PythonTool = "poetry"

startDownloadingPattern = `^\s*Downloading\s`
downloadingCaptureGroup = `[^\s]*`
startUsingCachedPattern = `^\s*Using\scached\s`
usingCacheCaptureGroup = `[\S]+`
endPattern = `\s\(`
)

type PythonTool string
Expand Down Expand Up @@ -152,6 +158,50 @@ func getFilePath(srcPath, fileName string) (string, error) {
return filePath, nil
}

// getMultilineSplitCaptureOutputPattern creates the CmdOutputPattern parsers that extract a value
// matching startCollectingPattern + (captureGroup) + endCollectingPattern from a log whose lines
// may be wrapped because of a line size limitation.
//
// Since the log is parsed line by line, two parsers are returned, in this order:
//  1. A parser that matches the complete pattern on a single line and passes the match to handler.
//  2. A parser that matches every line and stitches wrapped lines together. It starts buffering
//     on a line that matches startCollectingPattern without completing the full pattern, keeps
//     appending lines until one matches endCollectingPattern, and then passes the match found in
//     the stitched content to handler (MatchedResults[1] holds the captured value).
//
// The second parser keeps its buffer in variables captured by its closure.
func getMultilineSplitCaptureOutputPattern(startCollectingPattern, captureGroup, endCollectingPattern string, handler func(pattern *gofrogcmd.CmdOutputPattern) (string, error)) (parsers []*gofrogcmd.CmdOutputPattern) {
	// Prepare regex patterns.
	oneLineRegex := regexp.MustCompile(startCollectingPattern + `(` + captureGroup + `)` + endCollectingPattern)
	startCollectionRegexp := regexp.MustCompile(startCollectingPattern)
	endCollectionRegexp := regexp.MustCompile(endCollectingPattern)

	// Create a parser for single line pattern matches.
	parsers = append(parsers, &gofrogcmd.CmdOutputPattern{RegExp: oneLineRegex, ExecFunc: handler})

	// Create a parser for multi line pattern matches.
	lineBuffer := ""
	collectingMultiLineValue := false
	parsers = append(parsers, &gofrogcmd.CmdOutputPattern{RegExp: regexp.MustCompile(".*"), ExecFunc: func(pattern *gofrogcmd.CmdOutputPattern) (string, error) {
		if !collectingMultiLineValue {
			// Start collecting only when the line opens the pattern without completing it.
			// A line that already holds the complete value is handled by the single line parser;
			// buffering it as well would keep the collection open until an unrelated line matches
			// the end pattern and would then report the same capture to the handler a second time.
			if startCollectionRegexp.MatchString(pattern.Line) && !oneLineRegex.MatchString(pattern.Line) {
				collectingMultiLineValue = true
				lineBuffer = pattern.Line
			}
			return pattern.Line, nil
		}

		// Add the line content to the buffer and wait for the line that closes the pattern.
		lineBuffer += pattern.Line
		if !endCollectionRegexp.MatchString(pattern.Line) {
			return pattern.Line, nil
		}

		// The collection is done; release the buffer before handing over the result.
		collectingMultiLineValue = false
		stitchedContent := lineBuffer
		lineBuffer = ""

		// Simulate a one line content check to make sure we have a regex match.
		if matchedResults := oneLineRegex.FindStringSubmatch(stitchedContent); matchedResults != nil {
			return handler(&gofrogcmd.CmdOutputPattern{Line: pattern.Line, MatchedResults: matchedResults})
		}
		return pattern.Line, nil
	}})

	return
}

func InstallWithLogParsing(tool PythonTool, commandArgs []string, log utils.Log, srcPath string) (map[string]entities.Dependency, error) {
if tool == Pipenv {
// Add verbosity flag to pipenv commands to collect necessary data
Expand All @@ -161,19 +211,14 @@ func InstallWithLogParsing(tool PythonTool, commandArgs []string, log utils.Log,
installCmd.Dir = srcPath

dependenciesMap := map[string]entities.Dependency{}

// Create regular expressions for log parsing.
collectingRegexp := regexp.MustCompile(`^Collecting\s(\w[\w-.]+)`)
downloadingRegexp := regexp.MustCompile(`^\s*Downloading\s([^\s]*)\s\(`)
usingCachedRegexp := regexp.MustCompile(`^\s*Using\scached\s([\S]+)\s\(`)
alreadySatisfiedRegexp := regexp.MustCompile(`^Requirement\salready\ssatisfied:\s(\w[\w-.]+)`)
parsers := []*gofrogcmd.CmdOutputPattern{}

var packageName string
expectingPackageFilePath := false

// Extract downloaded package name.
dependencyNameParser := gofrogcmd.CmdOutputPattern{
RegExp: collectingRegexp,
parsers = append(parsers, &gofrogcmd.CmdOutputPattern{
RegExp: regexp.MustCompile(`^Collecting\s(\w[\w-.]+)`),
ExecFunc: func(pattern *gofrogcmd.CmdOutputPattern) (string, error) {
// If this pattern matched a second time before downloaded-file-name was found, prompt a message.
if expectingPackageFilePath {
Expand All @@ -186,7 +231,7 @@ func InstallWithLogParsing(tool PythonTool, commandArgs []string, log utils.Log,
}

// Check for out of bound results.
if len(pattern.MatchedResults)-1 < 0 {
if len(pattern.MatchedResults)-1 <= 0 {
log.Debug(fmt.Sprintf("Failed extracting package name from line: %s", pattern.Line))
return pattern.Line, nil
}
Expand All @@ -197,49 +242,34 @@ func InstallWithLogParsing(tool PythonTool, commandArgs []string, log utils.Log,

return pattern.Line, nil
},
}

// Extract downloaded file, stored in Artifactory.
downloadedFileParser := gofrogcmd.CmdOutputPattern{
RegExp: downloadingRegexp,
ExecFunc: func(pattern *gofrogcmd.CmdOutputPattern) (string, error) {
// Check for out of bound results.
if len(pattern.MatchedResults)-1 < 0 {
log.Debug(fmt.Sprintf("Failed extracting download path from line: %s", pattern.Line))
return pattern.Line, nil
}

// If this pattern matched before package-name was found, do not collect this path.
if !expectingPackageFilePath {
log.Debug(fmt.Sprintf("Could not resolve package name for download path: %s , continuing...", packageName))
return pattern.Line, nil
}
})

// Save dependency information.
filePath := pattern.MatchedResults[1]
lastSlashIndex := strings.LastIndex(filePath, "/")
var fileName string
if lastSlashIndex == -1 {
fileName = filePath
} else {
fileName = filePath[lastSlashIndex+1:]
}
dependenciesMap[strings.ToLower(packageName)] = entities.Dependency{Id: fileName}
expectingPackageFilePath = false

log.Debug(fmt.Sprintf("Found package: %s installed with: %s", packageName, fileName))
saveCaptureGroupAsDependencyInfo := func(pattern *gofrogcmd.CmdOutputPattern) (string, error) {
fileName := extractFileNameFromRegexCaptureGroup(pattern)
if fileName == "" {
log.Debug(fmt.Sprintf("Failed extracting download path from line: %s", pattern.Line))
return pattern.Line, nil
},
}
// If this pattern matched before package-name was found, do not collect this path.
if !expectingPackageFilePath {
log.Debug(fmt.Sprintf("Could not resolve package name for download path: %s , continuing...", packageName))
return pattern.Line, nil
}
// Save dependency information.
dependenciesMap[strings.ToLower(packageName)] = entities.Dependency{Id: fileName}
expectingPackageFilePath = false
log.Debug(fmt.Sprintf("Found package: %s installed with: %s", packageName, fileName))
return pattern.Line, nil
}

cachedFileParser := gofrogcmd.CmdOutputPattern{
RegExp: usingCachedRegexp,
ExecFunc: downloadedFileParser.ExecFunc,
}
// Extract downloaded file, stored in Artifactory. (value at log may be split into multiple lines)
parsers = append(parsers, getMultilineSplitCaptureOutputPattern(startDownloadingPattern, downloadingCaptureGroup, endPattern, saveCaptureGroupAsDependencyInfo)...)
// Extract cached file, stored in Artifactory. (value at log may be split into multiple lines)
parsers = append(parsers, getMultilineSplitCaptureOutputPattern(startUsingCachedPattern, usingCacheCaptureGroup, endPattern, saveCaptureGroupAsDependencyInfo)...)

// Extract already installed packages names.
installedPackagesParser := gofrogcmd.CmdOutputPattern{
RegExp: alreadySatisfiedRegexp,
parsers = append(parsers, &gofrogcmd.CmdOutputPattern{
RegExp: regexp.MustCompile(`^Requirement\salready\ssatisfied:\s(\w[\w-.]+)`),
ExecFunc: func(pattern *gofrogcmd.CmdOutputPattern) (string, error) {
// Check for out of bound results.
if len(pattern.MatchedResults)-1 < 0 {
Expand All @@ -252,12 +282,28 @@ func InstallWithLogParsing(tool PythonTool, commandArgs []string, log utils.Log,
log.Debug(fmt.Sprintf("Found package: %s already installed", pattern.MatchedResults[1]))
return pattern.Line, nil
},
}
})

// Execute command.
_, errorOut, _, err := gofrogcmd.RunCmdWithOutputParser(installCmd, true, &dependencyNameParser, &downloadedFileParser, &cachedFileParser, &installedPackagesParser)
_, errorOut, _, err := gofrogcmd.RunCmdWithOutputParser(installCmd, true, parsers...)
if err != nil {
return nil, fmt.Errorf("failed running %s command with error: '%s - %s'", string(tool), err.Error(), errorOut)
}
return dependenciesMap, nil
}

// extractFileNameFromRegexCaptureGroup returns the last path segment (the text after the final "/")
// of the first capture group stored in pattern.MatchedResults.
// An empty string is returned when the match holds no capture group.
func extractFileNameFromRegexCaptureGroup(pattern *gofrogcmd.CmdOutputPattern) (fileName string) {
	results := pattern.MatchedResults
	if len(results) < 2 {
		// Only the full match (or nothing at all) is available - there is no capture to read.
		return ""
	}
	captured := results[1]
	// LastIndex returns -1 when there is no "/", so the slice starts at 0 and keeps the whole value.
	return captured[strings.LastIndex(captured, "/")+1:]
}
128 changes: 128 additions & 0 deletions utils/pythonutils/utils_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
package pythonutils

import (
"fmt"
"strings"
"testing"

gofrogcmd "github.com/jfrog/gofrog/io"
"github.com/stretchr/testify/assert"
)

// TestGetMultilineCaptureOutputPattern feeds sample install log output, line by line, to the parsers
// returned by getMultilineSplitCaptureOutputPattern and checks that the expected file name is
// captured exactly once, both when the value sits on a single line and when it is split over several lines.
func TestGetMultilineCaptureOutputPattern(t *testing.T) {
// Each case holds the log text, the three pattern parts passed to the parser factory and the file name expected to be captured.
tests := []struct {
name string
text string
startCapturePattern string
captureGroupPattern string
endCapturePattern string
expectedCapture string
}{
{
name: "Using cached - single line captures",
startCapturePattern: startUsingCachedPattern,
captureGroupPattern: usingCacheCaptureGroup,
endCapturePattern: endPattern,
text: `
Looking in indexes:
***localhost:8081/artifactory/api/pypi/cli-pipenv-pypi-virtual-1698829624/simple
Collecting pexpect==4.8.0 (from -r /tmp/pipenv-qzun2hd3-requirements/pipenv-o_899oue-hashed-reqs.txt (line 1))
Using cached http://localhost:8081/artifactory/api/pypi/cli-pipenv-pypi-virtual-1698829624/packages/packages/39/7b/88dbb785881c28a102619d46423cb853b46dbccc70d3ac362d99773a78ce/pexpect-4.8.0-py2.py3-none-any.whl (59 kB)`,
expectedCapture: `pexpect-4.8.0-py2.py3-none-any.whl`,
},
{
name: "Using cached - multi line captures",
startCapturePattern: startUsingCachedPattern,
captureGroupPattern: usingCacheCaptureGroup,
endCapturePattern: endPattern,
text: `
Looking in indexes:
***localhost:8081/artifactory/api/pypi/cli-pipenv-pypi-virtual-16
98829624/simple
Collecting pexpect==4.8.0 (from -r
/tmp/pipenv-qzun2hd3-requirements/pipenv-o_899oue-hashed-reqs.txt (line 1))
Using cached
http://localhost:8081/artifactory/api/pypi/cli-pipenv-pypi-virtual-1698829624/pa
ckages/packages/39/7b/88dbb785881c28a102619d46423cb853b46dbccc70d3ac362d99773a78
ce/pexpect-4.8.0-py2.py3-none-any.whl (59 kB)`,
expectedCapture: `pexpect-4.8.0-py2.py3-none-any.whl`,
},
{
name: "Downloading - single line captures",
startCapturePattern: startDownloadingPattern,
captureGroupPattern: downloadingCaptureGroup,
endCapturePattern: endPattern,
text: ` Preparing metadata (pyproject.toml): finished with status 'done'
Collecting PyYAML==5.1.2 (from jfrog-python-example==1.0)
Downloading http://localhost:8081/artifactory/api/pypi/cli-pypi-virtual-1698829558/packages/packages/e3/e8/b3212641ee2718d556df0f23f78de8303f068fe29cdaa7a91018849582fe/PyYAML-5.1.2.tar.gz (265 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 265.0/265.0 kB 364.4 MB/s eta 0:00:00
Installing build dependencies: started`,
expectedCapture: `PyYAML-5.1.2.tar.gz`,
},
{
name: "Downloading - multi line captures",
startCapturePattern: startDownloadingPattern,
captureGroupPattern: downloadingCaptureGroup,
endCapturePattern: endPattern,
text: ` Preparing metadata (pyproject.toml): finished with status 'done'
Collecting PyYAML==5.1.2 (from jfrog-python-example==1.0)
Downloading http://localhost:8081/artifactory/api/pypi/cli-pypi-virtual-1698
829558/packages/packages/e3/e8/b3212641ee2718d556df0f23f78de8303f068fe29cdaa7a91018849
582fe/PyYAML-5.1.2.tar.gz (265 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 265.0/265.0 kB 364.4 MB/s eta 0:00:00
Installing build dependencies: started`,
expectedCapture: `PyYAML-5.1.2.tar.gz`,
},
}

// Run every case as a sub-test with its own set of parsers and its own capture recorder.
for _, testCase := range tests {
t.Run(testCase.name, func(t *testing.T) {
// aggFunc is the handler given to the parsers; captures points to the values it recorded.
aggFunc, captures := validateCaptures(testCase.expectedCapture)
runDummyTextStream(t, testCase.text, getMultilineSplitCaptureOutputPattern(
testCase.startCapturePattern,
testCase.captureGroupPattern,
testCase.endCapturePattern,
aggFunc,
))
// Exactly one capture is expected; its value is compared only when the length check passes.
if assert.Len(t, (*captures), 1, fmt.Sprintf("Expected 1 captured group, got size: %d", len(*captures))) {
assert.Equal(t, testCase.expectedCapture, (*captures)[0], fmt.Sprintf("Expected capture group: %s, got: %s", testCase.expectedCapture, (*captures)[0]))
}
})
}
}

// validateCaptures builds a parser handler that extracts the file name from each match it receives
// and records it whenever it equals one of expectedCaptures (once per equal entry).
// It returns the handler together with a pointer to the slice of recorded values.
func validateCaptures(expectedCaptures ...string) (func(pattern *gofrogcmd.CmdOutputPattern) (string, error), *[]string) {
	collected := make([]string, 0)
	recordMatches := func(pattern *gofrogcmd.CmdOutputPattern) (string, error) {
		fileName := extractFileNameFromRegexCaptureGroup(pattern)
		for i := range expectedCaptures {
			if fileName != expectedCaptures[i] {
				continue
			}
			collected = append(collected, expectedCaptures[i])
		}
		// The line is passed through untouched and no error is ever reported.
		return pattern.Line, nil
	}
	return recordMatches, &collected
}

// runDummyTextStream simulates a command log stream: txt is split on "\n" and every line is offered
// to each parser, in order. A parser whose RegExp matches the line gets its MatchedResults and Line
// fields filled in and its ExecFunc invoked; an error returned by ExecFunc fails the assertion.
func runDummyTextStream(t *testing.T, txt string, parsers []*gofrogcmd.CmdOutputPattern) {
	for _, currentLine := range strings.Split(txt, "\n") {
		for _, outputParser := range parsers {
			// A nil result means the parser's regexp does not match this line.
			submatches := outputParser.RegExp.FindStringSubmatch(currentLine)
			if submatches == nil {
				continue
			}
			outputParser.MatchedResults = submatches
			outputParser.Line = currentLine
			// Only the error matters here; the returned line is discarded.
			_, execErr := outputParser.ExecFunc(outputParser)
			assert.NoError(t, execErr)
		}
	}
}

0 comments on commit 31d3204

Please sign in to comment.