Skip to content

Commit

Permalink
Merge pull request #96 from guacamoleo/develop
Browse files Browse the repository at this point in the history
fixing GSU with prefetching
  • Loading branch information
guacamoleo authored May 30, 2017
2 parents 7a07945 + 7ddfb77 commit 2d1d9ff
Show file tree
Hide file tree
Showing 24 changed files with 1,539 additions and 1,538 deletions.
15 changes: 6 additions & 9 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,14 @@ parallel rocm_fiji: {
dir("${build_dir_release}") {
stage("unit tests") {
sh "tensile ../../Tensile/Configs/test_hgemm_defaults.yaml hgemm_defaults"
sh "tensile ../../Tensile/Configs/test_hgemm_scalar_load_patterns.yaml hgemm_scalar_load_patterns"
sh "tensile ../../Tensile/Configs/test_hgemm_scalar_tile_sizes.yaml hgemm_scalar_tile_sizes"
sh "tensile ../../Tensile/Configs/test_hgemm_scalar_branches.yaml hgemm_scalar_branches"
sh "tensile ../../Tensile/Configs/test_sgemm_defaults.yaml sgemm_defaults"
sh "tensile ../../Tensile/Configs/test_sgemm_scalar_load_patterns.yaml sgemm_scalar_load_patterns"
sh "tensile ../../Tensile/Configs/test_sgemm_scalar_tile_sizes.yaml sgemm_scalar_tile_sizes"
sh "tensile ../../Tensile/Configs/test_sgemm_scalar_branches.yaml sgemm_scalar_branches"
//sh "tensile ../../Tensile/Configs/test_sgemm_vector_load_patterns.yaml sgemm_vector_load_patterns"
//sh "tensile ../../Tensile/Configs/test_sgemm_vector_tile_sizes.yaml sgemm_vector_tile_sizes"
sh "tensile ../../Tensile/Configs/test_sgemm_vector_branches.yaml sgemm_vector_branches"
sh "tensile ../../Tensile/Configs/test_dgemm_defaults.yaml dgemm_defaults"

sh "tensile --runtime-language=HIP --kernel-language=HIP ../../Tensile/Configs/test_hgemm.yaml hgemm"
sh "tensile --runtime-language=HIP --kernel-language=HIP ../../Tensile/Configs/test_sgemm.yaml sgemm"

// TODO re-enable when jenkins supports opencl
//sh "tensile --runtime-language=OCL --kernel-language=OCL ../../Tensile/Configs/test_sgemm_vectors.yaml sgemm_vectors"
}
}
}
Expand Down
7 changes: 7 additions & 0 deletions Tensile/BenchmarkProblems.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,8 @@ def benchmarkProblemType( problemTypeConfig, problemSizeGroupConfig, \
if process.returncode:
printWarning("Benchmark Process exited with code %u" % process.returncode)
popWorkingPath() # build
else:
print1("# Already benchmarked; skipping.")


############################################################################
Expand Down Expand Up @@ -437,6 +439,9 @@ def __init__(self):
# Add Winning Parameters For Hardcoded Parameters
def addResults( self, hardcodedParameterList, benchmarkPermutations, \
solutions, results):
if globalParameters["PrintLevel"] >= 1:
print1("# Adding Results to Solution Database")
progressBar = ProgressBar(len(results))
for hardcodedIdx in range(0, len(results)):
hardcodedResults = results[hardcodedIdx]
hardcodedParameters = hardcodedParameterList[hardcodedIdx]
Expand All @@ -463,6 +468,8 @@ def addResults( self, hardcodedParameterList, benchmarkPermutations, \
#oldScore = matches[0][2]
self.winners[hardcodedParametersKey][0].update(winningParameters)
self.winners[hardcodedParametersKey][1] = winningScore
if globalParameters["PrintLevel"] >= 1:
progressBar.increment()


##########################################################
Expand Down
170 changes: 85 additions & 85 deletions Tensile/BenchmarkStructs.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ def convertParametersToSteps(self):
self.forkHardcodedParameters(forkPermutations)

############################################################################
# (II-3) benchmark common parameters
# (II-3) benchmark fork parameters
print2("")
print2("####################################################################")
print1("# Benchmark Fork Parameters")
Expand All @@ -380,92 +380,92 @@ def convertParametersToSteps(self):
print1("# Join Parameters")
macroTileJoinSet = set()
totalPermutations = 1
for joinName in self.joinParameters:
# joining a parameter with only a single value
if hasParam(joinName, self.singleValueParameters):
pass

elif hasParam(joinName, self.forkParameters):
# count permutations
for param in self.forkParameters:
for name in param: # only 1
if name == joinName:
values = param[name]
localPermutations = len(values)
print2("JoinParameter %s has %u possibilities" % (joinName, localPermutations))
totalPermutations *= localPermutations

##########################################################################
# (II-4.2) Join MacroTile
elif joinName == "MacroTile":
print2("JoinParam: MacroTile")
# get possible WorkGroupEdges from forked
print2("currentForkParameters = %s" % str(self.forkParameters))
threadTileValues = []
workGroupValues = []
# todo having MacroTile as join parameter causes trouble if
# one parameter is benchmarked rather than forked
# however, this may still be the right way to do it

# count permutations
for paramList in [self.benchmarkCommonParameters, \
self.forkParameters, self.benchmarkForkParameters, \
self.benchmarkJoinParameters, self.singleValueParameters ]:
if hasParam("ThreadTile", paramList):
threadTileValues = getParamValues("ThreadTile", paramList)
if hasParam("WorkGroup", paramList):
workGroupValues = getParamValues("WorkGroup", paramList)
macroTilePermutations = len(workGroupValues) * len(threadTileValues)
print2("# Total JoinMacroTile Permutations: %u" % macroTilePermutations)

# enumerate permutations
for i in range(0, macroTilePermutations):
pIdx = i
workGroupIdx = pIdx % len(workGroupValues)
pIdx /= len(workGroupValues)
threadTileIdx = pIdx % len(threadTileValues)

workGroup = workGroupValues[workGroupIdx]
threadTile = threadTileValues[threadTileIdx]

macroTile0 = workGroup[0]*threadTile[0]
macroTile1 = workGroup[1]*threadTile[1]
macroTileJoinSet.add((macroTile0, macroTile1))
totalPermutations *= len(macroTileJoinSet)
print2("JoinMacroTileSet(%u): %s" % (len(macroTileJoinSet), macroTileJoinSet) )

# invalid join parameter
else:
validJoinNames = ["MacroTile", "DepthU"]
for validParam in self.forkParameters:
for validName in validParam: # only 1
validJoinNames.append(validName)
printExit("JoinParameter \"%s\" not in %s" % (joinName, validJoinNames) )

############################################################################
# (II-4.4) Enumerate Permutations Other * MacroTile * DepthU
macroTiles = list(macroTileJoinSet)
print2("# TotalJoinPermutations = %u" % ( totalPermutations) )
joinPermutations = []
for i in range(0, totalPermutations):
joinPermutations.append({})
pIdx = i
if len(self.joinParameters) > 1:
for joinName in self.joinParameters:
if hasParam(joinName, self.forkParameters):
for paramDict in self.forkParameters: # hardcodedPermutations
if joinName in paramDict:
paramValues = paramDict[joinName]
valueIdx = pIdx % len(paramValues)
joinPermutations[i][joinName] = paramValues[valueIdx]
pIdx /= len(paramValues)
break
# joining a parameter with only a single value
if hasParam(joinName, self.singleValueParameters):
pass
elif hasParam(joinName, self.forkParameters):
# count permutations
for param in self.forkParameters:
for name in param: # only 1
if name == joinName:
values = param[name]
localPermutations = len(values)
print2("JoinParameter %s has %u possibilities" % (joinName, localPermutations))
totalPermutations *= localPermutations

##########################################################################
# (II-4.2) Join MacroTile
elif joinName == "MacroTile":
valueIdx = pIdx % len(macroTiles)
pIdx /= len(macroTiles)
joinPermutations[i]["MacroTile0"] = macroTiles[valueIdx][0]
joinPermutations[i]["MacroTile1"] = macroTiles[valueIdx][1]
if len(joinPermutations) > 0:
self.joinHardcodedParameters(joinPermutations)
print2("JoinParam: MacroTile")
# get possible WorkGroupEdges from forked
print2("currentForkParameters = %s" % str(self.forkParameters))
threadTileValues = []
workGroupValues = []
# todo having MacroTile as join parameter causes trouble if
# one parameter is benchmarked rather than forked
# however, this may still be the right way to do it

# count permutations
for paramList in [self.benchmarkCommonParameters, \
self.forkParameters, self.benchmarkForkParameters, \
self.benchmarkJoinParameters, self.singleValueParameters ]:
if hasParam("ThreadTile", paramList):
threadTileValues = getParamValues("ThreadTile", paramList)
if hasParam("WorkGroup", paramList):
workGroupValues = getParamValues("WorkGroup", paramList)
macroTilePermutations = len(workGroupValues) * len(threadTileValues)
print2("# Total JoinMacroTile Permutations: %u" % macroTilePermutations)

# enumerate permutations
for i in range(0, macroTilePermutations):
pIdx = i
workGroupIdx = pIdx % len(workGroupValues)
pIdx /= len(workGroupValues)
threadTileIdx = pIdx % len(threadTileValues)

workGroup = workGroupValues[workGroupIdx]
threadTile = threadTileValues[threadTileIdx]

macroTile0 = workGroup[0]*threadTile[0]
macroTile1 = workGroup[1]*threadTile[1]
macroTileJoinSet.add((macroTile0, macroTile1))
totalPermutations *= len(macroTileJoinSet)
print2("JoinMacroTileSet(%u): %s" % (len(macroTileJoinSet), macroTileJoinSet) )

# invalid join parameter
else:
validJoinNames = ["MacroTile"]
for validParam in self.forkParameters:
for validName in validParam: # only 1
validJoinNames.append(validName)
printExit("JoinParameter \"%s\" not in %s" % (joinName, validJoinNames) )

############################################################################
# (II-4.4) Enumerate Permutations Other * MacroTile * DepthU
macroTiles = list(macroTileJoinSet)
print2("# TotalJoinPermutations = %u" % ( totalPermutations) )
joinPermutations = []
for i in range(0, totalPermutations):
joinPermutations.append({})
pIdx = i
for joinName in self.joinParameters:
if hasParam(joinName, self.forkParameters):
for paramDict in self.forkParameters: # hardcodedPermutations
if joinName in paramDict:
paramValues = paramDict[joinName]
valueIdx = pIdx % len(paramValues)
joinPermutations[i][joinName] = paramValues[valueIdx]
pIdx /= len(paramValues)
break
elif joinName == "MacroTile":
valueIdx = pIdx % len(macroTiles)
pIdx /= len(macroTiles)
joinPermutations[i]["MacroTile0"] = macroTiles[valueIdx][0]
joinPermutations[i]["MacroTile1"] = macroTiles[valueIdx][1]
if len(joinPermutations) > 0:
self.joinHardcodedParameters(joinPermutations)


############################################################################
Expand Down
21 changes: 7 additions & 14 deletions Tensile/Common.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
globalParameters["ForceRedoLibraryLogic"] = True
globalParameters["ForceRedoLibraryClient"] = True
globalParameters["EnqueuesPerSync"] = 1
globalParameters["SyncsPerBenchmark"] = 4
globalParameters["SyncsPerBenchmark"] = 1
globalParameters["PinClocks"] = False
globalParameters["KernelTime"] = False

Expand Down Expand Up @@ -162,21 +162,14 @@
{"NumLoadsCoalescedB": [ 1 ] },
{"WorkGroup": [ [16,16,1]] },
{"WorkGroupMapping": [ 1 ] },
{"ThreadTile": [ [4,4] ] },
{"DepthU": [ 16 ] },
]
# benchmark these solutions independently
defaultForkParameters = [
{"ThreadTile": [ [4,4], [4,8], [8,8] ] },
{"DepthU": [ 4, 8, 16 ] },
]
# keep one winner per solution and it affects which will win
defaultBenchmarkForkParameters = [
]
# final list of solutions
defaultJoinParameters = [
"MacroTile" ]
# keep one winner per solution and it would affect which solutions fastest
defaultBenchmarkJoinParameters = [
]
defaultForkParameters = []
defaultBenchmarkForkParameters = []
defaultJoinParameters = []
defaultBenchmarkJoinParameters = []

# dictionary of defaults composed of the 1st option for each parameter
defaultSolution = {}
Expand Down
42 changes: 41 additions & 1 deletion Tensile/Configs/tensor_contraction.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,52 @@ BenchmarkProblems:
BenchmarkForkParameters:
JoinParameters:
- MacroTile
- DepthU
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Range: [ [16, 128], [16, 128], [2, 2, 4], [256] ]

- # 7-D Image Convolution
- # ProblemType
OperationType: TensorContraction
DataType: s
UseBeta: True
NumIndicesC: 4
IndexAssignmentsA: [6, 5, 0, 1, 4, 3]
IndexAssignmentsB: [6, 5, 1, 4, 2, 3]

- # BenchmarkProblemSizeGroup - Standard
InitialSolutionParameters:
BenchmarkCommonParameters:
- ProblemSizes:
- Exact: [ 32, 32, 32, 100, 3, 5, 5 ] # caffe layer 1
- Exact: [ 16, 16, 32, 100, 32, 5, 5 ] # caffe layer 2
- Exact: [ 8, 8, 64, 100, 32, 5, 5 ] # caffe layer 3
- EdgeType: ["ShiftPtr"]
- WorkGroupMapping: [ 1 ]
- LoopDoWhile: [False]
- LoopTail: [True]
ForkParameters:
- ThreadTile:
- [ 4, 4 ]
- [ 2, 2 ]
- WorkGroup:
- [ 4, 8, 2 ]
- [ 8, 8, 1 ]
- [ 16, 16, 1 ]
- DepthU: [2, 4]
BenchmarkForkParameters:
JoinParameters:
- MacroTile
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Exact: [ 32, 32, 32, 100, 3, 5, 5 ] # caffe layer 1
- Exact: [ 16, 16, 32, 100, 32, 5, 5 ] # caffe layer 2
- Exact: [ 8, 8, 64, 100, 32, 5, 5 ] # caffe layer 3



LibraryLogic:
ScheduleName: Fiji
DeviceNames: ["R9 Nano"]
Expand Down
Loading

0 comments on commit 2d1d9ff

Please sign in to comment.