From 8e43355cf780eeee9d1861ee4d81a4fa42111189 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Fri, 26 May 2017 14:41:06 -0500 Subject: [PATCH 1/3] fixing GSU and prefetching, shortening tests, adding ocl tests --- Jenkinsfile | 23 +- Tensile/BenchmarkStructs.py | 170 ++++----- Tensile/Common.py | 21 +- Tensile/Configs/tensor_contraction.yaml | 42 ++- Tensile/Configs/test_hgemm.yaml | 333 ++++++++++++++++++ .../Configs/test_hgemm_scalar_branches.yaml | 137 ------- .../test_hgemm_scalar_load_patterns.yaml | 137 ------- .../Configs/test_hgemm_scalar_tile_sizes.yaml | 190 ---------- Tensile/Configs/test_hgemm_vectors.yaml | 333 ++++++++++++++++++ Tensile/Configs/test_sgemm.yaml | 333 ++++++++++++++++++ .../Configs/test_sgemm_scalar_branches.yaml | 141 -------- .../test_sgemm_scalar_load_patterns.yaml | 133 ------- .../Configs/test_sgemm_scalar_tile_sizes.yaml | 190 ---------- .../Configs/test_sgemm_vector_branches.yaml | 141 -------- .../test_sgemm_vector_load_patterns.yaml | 137 ------- .../Configs/test_sgemm_vector_tile_sizes.yaml | 190 ---------- Tensile/Configs/test_sgemm_vectors.yaml | 333 ++++++++++++++++++ Tensile/KernelWriter.py | 7 +- Tensile/KernelWriterSource.py | 33 +- Tensile/SolutionStructs.py | 11 +- Tensile/SolutionWriter.py | 12 +- Tensile/Tensile.py | 17 +- 22 files changed, 1538 insertions(+), 1526 deletions(-) create mode 100644 Tensile/Configs/test_hgemm.yaml delete mode 100644 Tensile/Configs/test_hgemm_scalar_branches.yaml delete mode 100644 Tensile/Configs/test_hgemm_scalar_load_patterns.yaml delete mode 100644 Tensile/Configs/test_hgemm_scalar_tile_sizes.yaml create mode 100644 Tensile/Configs/test_hgemm_vectors.yaml create mode 100644 Tensile/Configs/test_sgemm.yaml delete mode 100644 Tensile/Configs/test_sgemm_scalar_branches.yaml delete mode 100644 Tensile/Configs/test_sgemm_scalar_load_patterns.yaml delete mode 100644 Tensile/Configs/test_sgemm_scalar_tile_sizes.yaml delete mode 100644 Tensile/Configs/test_sgemm_vector_branches.yaml delete mode 100644 Tensile/Configs/test_sgemm_vector_load_patterns.yaml delete mode 100644 Tensile/Configs/test_sgemm_vector_tile_sizes.yaml create mode 100644 Tensile/Configs/test_sgemm_vectors.yaml diff --git a/Jenkinsfile b/Jenkinsfile index b8ef8f9d3..2a5108110 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -66,17 +66,24 @@ parallel rocm_fiji: { dir("${build_dir_release}") { stage("unit tests") { sh "tensile ../../Tensile/Configs/test_hgemm_defaults.yaml hgemm_defaults" - sh "tensile ../../Tensile/Configs/test_hgemm_scalar_load_patterns.yaml hgemm_scalar_load_patterns" - sh "tensile ../../Tensile/Configs/test_hgemm_scalar_tile_sizes.yaml hgemm_scalar_tile_sizes" - sh "tensile ../../Tensile/Configs/test_hgemm_scalar_branches.yaml hgemm_scalar_branches" sh "tensile ../../Tensile/Configs/test_sgemm_defaults.yaml sgemm_defaults" - sh "tensile ../../Tensile/Configs/test_sgemm_scalar_load_patterns.yaml sgemm_scalar_load_patterns" - sh "tensile ../../Tensile/Configs/test_sgemm_scalar_tile_sizes.yaml sgemm_scalar_tile_sizes" - sh "tensile ../../Tensile/Configs/test_sgemm_scalar_branches.yaml sgemm_scalar_branches" + sh "tensile ../../Tensile/Configs/test_dgemm_defaults.yaml dgemm_defaults" + + sh "tensile --runtime-language=HIP --kernel-language=HIP ../../Tensile/Configs/test_hgemm.yaml hgemm" + sh "tensile --runtime-language=HIP --kernel-language=HIP ../../Tensile/Configs/test_sgemm.yaml sgemm" + + sh "tensile --runtime-language=OCL --kernel-language=OCL ../../Tensile/Configs/test_hgemm_vectors.yaml hgemm_vectors" + sh "tensile --runtime-language=OCL --kernel-language=OCL ../../Tensile/Configs/test_sgemm_vectors.yaml sgemm_vectors" + + //sh "tensile ../../Tensile/Configs/test_hgemm_scalar_load_patterns.yaml hgemm_scalar_load_patterns" + //sh "tensile ../../Tensile/Configs/test_hgemm_scalar_tile_sizes.yaml hgemm_scalar_tile_sizes" + //sh "tensile ../../Tensile/Configs/test_hgemm_scalar_branches.yaml hgemm_scalar_branches" + //sh "tensile ../../Tensile/Configs/test_sgemm_scalar_load_patterns.yaml sgemm_scalar_load_patterns" + //sh "tensile ../../Tensile/Configs/test_sgemm_scalar_tile_sizes.yaml sgemm_scalar_tile_sizes" + //sh "tensile ../../Tensile/Configs/test_sgemm_scalar_branches.yaml sgemm_scalar_branches" //sh "tensile ../../Tensile/Configs/test_sgemm_vector_load_patterns.yaml sgemm_vector_load_patterns" //sh "tensile ../../Tensile/Configs/test_sgemm_vector_tile_sizes.yaml sgemm_vector_tile_sizes" - sh "tensile ../../Tensile/Configs/test_sgemm_vector_branches.yaml sgemm_vector_branches" - sh "tensile ../../Tensile/Configs/test_dgemm_defaults.yaml dgemm_defaults" + //sh "tensile ../../Tensile/Configs/test_sgemm_vector_branches.yaml sgemm_vector_branches" } } } diff --git a/Tensile/BenchmarkStructs.py b/Tensile/BenchmarkStructs.py index 30eabd1c3..ff52b67be 100644 --- a/Tensile/BenchmarkStructs.py +++ b/Tensile/BenchmarkStructs.py @@ -365,7 +365,7 @@ def convertParametersToSteps(self): self.forkHardcodedParameters(forkPermutations) ############################################################################ - # (II-3) benchmark common parameters + # (II-3) benchmark fork parameters print2("") print2("####################################################################") print1("# Benchmark Fork Parameters") @@ -380,92 +380,92 @@ def convertParametersToSteps(self): print1("# Join Parameters") macroTileJoinSet = set() totalPermutations = 1 - for joinName in self.joinParameters: - # joining a parameter with only a single value - if hasParam(joinName, self.singleValueParameters): - pass - - elif hasParam(joinName, self.forkParameters): - # count permutations - for param in self.forkParameters: - for name in param: # only 1 - if name == joinName: - values = param[name] - localPermutations = len(values) - print2("JoinParameter %s has %u possibilities" % (joinName, localPermutations)) - totalPermutations *= localPermutations - - ########################################################################## - # (II-4.2) Join MacroTile - elif joinName == "MacroTile": - print2("JoinParam: MacroTile") - # get possible WorkGroupEdges from forked - print2("currentForkParameters = %s" % str(self.forkParameters)) - threadTileValues = [] - workGroupValues = [] - # todo having MacroTile as join parameter causes trouble if - # one parameter is benchmarked rather than forked - # however, this may still be the right way to do it - - # count permutations - for paramList in [self.benchmarkCommonParameters, \ - self.forkParameters, self.benchmarkForkParameters, \ - self.benchmarkJoinParameters, self.singleValueParameters ]: - if hasParam("ThreadTile", paramList): - threadTileValues = getParamValues("ThreadTile", paramList) - if hasParam("WorkGroup", paramList): - workGroupValues = getParamValues("WorkGroup", paramList) - macroTilePermutations = len(workGroupValues) * len(threadTileValues) - print2("# Total JoinMacroTile Permutations: %u" % macroTilePermutations) - - # enumerate permutations - for i in range(0, macroTilePermutations): - pIdx = i - workGroupIdx = pIdx % len(workGroupValues) - pIdx /= len(workGroupValues) - threadTileIdx = pIdx % len(threadTileValues) - - workGroup = workGroupValues[workGroupIdx] - threadTile = threadTileValues[threadTileIdx] - - macroTile0 = workGroup[0]*threadTile[0] - macroTile1 = workGroup[1]*threadTile[1] - macroTileJoinSet.add((macroTile0, macroTile1)) - totalPermutations *= len(macroTileJoinSet) - print2("JoinMacroTileSet(%u): %s" % (len(macroTileJoinSet), macroTileJoinSet) ) - - # invalid join parameter - else: - validJoinNames = ["MacroTile", "DepthU"] - for validParam in self.forkParameters: - for validName in validParam: # only 1 - validJoinNames.append(validName) - printExit("JoinParameter \"%s\" not in %s" % (joinName, validJoinNames) ) - - ############################################################################ - # (II-4.4) Enumerate Permutations Other * MacroTile * DepthU - macroTiles = list(macroTileJoinSet) - print2("# TotalJoinPermutations = %u" % ( totalPermutations) ) - joinPermutations = [] - for i in range(0, totalPermutations): - joinPermutations.append({}) - pIdx = i + if len(self.joinParameters) > 1: for joinName in self.joinParameters: - if hasParam(joinName, self.forkParameters): - for paramDict in self.forkParameters: # hardcodedPermutations - if joinName in paramDict: - paramValues = paramDict[joinName] - valueIdx = pIdx % len(paramValues) - joinPermutations[i][joinName] = paramValues[valueIdx] - pIdx /= len(paramValues) - break + # joining a parameter with only a single value + if hasParam(joinName, self.singleValueParameters): + pass + elif hasParam(joinName, self.forkParameters): + # count permutations + for param in self.forkParameters: + for name in param: # only 1 + if name == joinName: + values = param[name] + localPermutations = len(values) + print2("JoinParameter %s has %u possibilities" % (joinName, localPermutations)) + totalPermutations *= localPermutations + + ########################################################################## + # (II-4.2) Join MacroTile elif joinName == "MacroTile": - valueIdx = pIdx % len(macroTiles) - pIdx /= len(macroTiles) - joinPermutations[i]["MacroTile0"] = macroTiles[valueIdx][0] - joinPermutations[i]["MacroTile1"] = macroTiles[valueIdx][1] - if len(joinPermutations) > 0: - self.joinHardcodedParameters(joinPermutations) + print2("JoinParam: MacroTile") + # get possible WorkGroupEdges from forked + print2("currentForkParameters = %s" % str(self.forkParameters)) + threadTileValues = [] + workGroupValues = [] + # todo having MacroTile as join parameter causes trouble if + # one parameter is benchmarked rather than forked + # however, this may still be the right way to do it + + # count permutations + for paramList in [self.benchmarkCommonParameters, \ + self.forkParameters, self.benchmarkForkParameters, \ + self.benchmarkJoinParameters, self.singleValueParameters ]: + if hasParam("ThreadTile", paramList): + threadTileValues = getParamValues("ThreadTile", paramList) + if hasParam("WorkGroup", paramList): + workGroupValues = getParamValues("WorkGroup", paramList) + macroTilePermutations = len(workGroupValues) * len(threadTileValues) + print2("# Total JoinMacroTile Permutations: %u" % macroTilePermutations) + + # enumerate permutations + for i in range(0, macroTilePermutations): + pIdx = i + workGroupIdx = pIdx % len(workGroupValues) + pIdx /= len(workGroupValues) + threadTileIdx = pIdx % len(threadTileValues) + + workGroup = workGroupValues[workGroupIdx] + threadTile = threadTileValues[threadTileIdx] + + macroTile0 = workGroup[0]*threadTile[0] + macroTile1 = workGroup[1]*threadTile[1] + macroTileJoinSet.add((macroTile0, macroTile1)) + totalPermutations *= len(macroTileJoinSet) + print2("JoinMacroTileSet(%u): %s" % (len(macroTileJoinSet), macroTileJoinSet) ) + + # invalid join parameter + else: + validJoinNames = ["MacroTile"] + for validParam in self.forkParameters: + for validName in validParam: # only 1 + validJoinNames.append(validName) + printExit("JoinParameter \"%s\" not in %s" % (joinName, validJoinNames) ) + + ############################################################################ + # (II-4.4) Enumerate Permutations Other * MacroTile * DepthU + macroTiles = list(macroTileJoinSet) + print2("# TotalJoinPermutations = %u" % ( totalPermutations) ) + joinPermutations = [] + for i in range(0, totalPermutations): + joinPermutations.append({}) + pIdx = i + for joinName in self.joinParameters: + if hasParam(joinName, self.forkParameters): + for paramDict in self.forkParameters: # hardcodedPermutations + if joinName in paramDict: + paramValues = paramDict[joinName] + valueIdx = pIdx % len(paramValues) + joinPermutations[i][joinName] = paramValues[valueIdx] + pIdx /= len(paramValues) + break + elif joinName == "MacroTile": + valueIdx = pIdx % len(macroTiles) + pIdx /= len(macroTiles) + joinPermutations[i]["MacroTile0"] = macroTiles[valueIdx][0] + joinPermutations[i]["MacroTile1"] = macroTiles[valueIdx][1] + if len(joinPermutations) > 0: + self.joinHardcodedParameters(joinPermutations) ############################################################################ diff --git a/Tensile/Common.py b/Tensile/Common.py index f3e71d30e..8f60c8944 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -66,7 +66,7 @@ globalParameters["ForceRedoLibraryLogic"] = True globalParameters["ForceRedoLibraryClient"] = True globalParameters["EnqueuesPerSync"] = 1 -globalParameters["SyncsPerBenchmark"] = 4 +globalParameters["SyncsPerBenchmark"] = 1 globalParameters["PinClocks"] = False globalParameters["KernelTime"] = False @@ -162,21 +162,14 @@ {"NumLoadsCoalescedB": [ 1 ] }, {"WorkGroup": [ [16,16,1]] }, {"WorkGroupMapping": [ 1 ] }, + {"ThreadTile": [ [4,4] ] }, + {"DepthU": [ 16 ] }, ] # benchmark these solution independently -defaultForkParameters = [ - {"ThreadTile": [ [4,4], [4,8], [8,8] ] }, - {"DepthU": [ 4, 8, 16 ] }, - ] -# keep one winner per solution and it affects which will win -defaultBenchmarkForkParameters = [ - ] -# final list of solutions -defaultJoinParameters = [ - "MacroTile" ] -# keep one winner per solution and it would affect which solutions fastest -defaultBenchmarkJoinParameters = [ - ] +defaultForkParameters = [] +defaultBenchmarkForkParameters = [] +defaultJoinParameters = [] +defaultBenchmarkJoinParameters = [] # dictionary of defaults comprised for 1st option for each parameter defaultSolution = {} diff --git a/Tensile/Configs/tensor_contraction.yaml b/Tensile/Configs/tensor_contraction.yaml index c73d513fc..2a3ee7a16 100644 --- a/Tensile/Configs/tensor_contraction.yaml +++ b/Tensile/Configs/tensor_contraction.yaml @@ -47,12 +47,52 @@ BenchmarkProblems: BenchmarkForkParameters: JoinParameters: - MacroTile - - DepthU BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - Range: [ [16, 128], [16, 128], [2, 2, 4], [256] ] + - # 7-D Image Convolution + - # ProblemType + OperationType: TensorContraction + DataType: s + UseBeta: True + NumIndicesC: 4 + IndexAssignmentsA: [6, 5, 0, 1, 4, 3] + IndexAssignmentsB: [6, 5, 1, 4, 2, 3] + + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + - ProblemSizes: + - Exact: [ 32, 32, 32, 100, 3, 5, 5 ] # caffe layer 1 + - Exact: [ 16, 16, 32, 100, 32, 5, 5 ] # caffe layer 2 + - Exact: [ 8, 8, 64, 100, 32, 5, 5 ] # caffe layer 3 + - EdgeType: ["ShiftPtr"] + - WorkGroupMapping: [ 1 ] + - LoopDoWhile: [False] + - LoopTail: [True] + ForkParameters: + - ThreadTile: + - [ 4, 4 ] + - [ 2, 2 ] + - WorkGroup: + - [ 4, 8, 2 ] + - [ 8, 8, 1 ] + - [ 16, 16, 1 ] + - DepthU: [2, 4] + BenchmarkForkParameters: + JoinParameters: + - MacroTile + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 32, 32, 32, 100, 3, 5, 5 ] # caffe layer 1 + - Exact: [ 16, 16, 32, 100, 32, 5, 5 ] # caffe layer 2 + - Exact: [ 8, 8, 64, 100, 32, 5, 5 ] # caffe layer 3 + + + LibraryLogic: ScheduleName: Fiji DeviceNames: ["R9 Nano"] diff --git a/Tensile/Configs/test_hgemm.yaml b/Tensile/Configs/test_hgemm.yaml new file mode 100644 index 000000000..f5409b73b --- /dev/null +++ b/Tensile/Configs/test_hgemm.yaml @@ -0,0 +1,333 @@ +GlobalParameters: + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + PrintSolutionRejectionReason: False + MinimumRequiredVersion: 3.0.0 + NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + DataInitTypeAB: 1 + DataInitTypeC: 1 + +BenchmarkProblems: + + ############################################################################ + # NN + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: False + UseBeta: False + Batched: True + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [1, 2], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [1] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [1], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [3], [63, 1, 64] ] + + + ############################################################################ + # NT + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: True + UseBeta: False + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [1] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + + + ############################################################################ + # TN + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: True + TransposeB: False + UseBeta: True + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [1] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + + + ############################################################################ + # TT + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: True + TransposeB: True + UseBeta: False + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [1] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + +LibraryLogic: + ScheduleName: Test + DeviceNames: ["Unknown"] +LibraryClient: + diff --git a/Tensile/Configs/test_hgemm_scalar_branches.yaml b/Tensile/Configs/test_hgemm_scalar_branches.yaml deleted file mode 100644 index 883b0127d..000000000 --- a/Tensile/Configs/test_hgemm_scalar_branches.yaml +++ /dev/null @@ -1,137 +0,0 @@ -# test load patterns -GlobalParameters: - MinimumRequiredVersion: 3.0.0 - NumElementsToValidate: -1 - ValidationMaxToPrint: 4 - -BenchmarkProblems: - - # sgemm NT - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: False - TransposeB: True - UseBeta: False - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["Branch"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - DepthU: [ 16 ] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [ 8, 8 ] - - [ 4, 8 ] - - [ 4, 4 ] - - WorkGroup: - - [ 8, 16, 1 ] - - [ 8, 16, 2 ] - - [ 16, 16, 1 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - - # sgemm NN - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: False - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["Branch"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - DepthU: [ 16 ] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [ 8, 8 ] - - [ 4, 8 ] - - [ 4, 4 ] - - WorkGroup: - - [ 8, 16, 1 ] - - [ 8, 16, 2 ] - - [ 16, 16, 1 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TN - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: True - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["Branch"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [-1] - - DepthU: [ 16 ] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [ 8, 8 ] - - [ 4, 8 ] - - [ 4, 4 ] - - WorkGroup: - - [ 8, 16, 1 ] - - [ 8, 16, 2 ] - - [ 16, 16, 1 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TT - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: True - TransposeB: True - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["Branch"] - - NumLoadsCoalescedA: [-1] - - NumLoadsCoalescedB: [1] - - DepthU: [ 16 ] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [ 8, 8 ] - - [ 4, 8 ] - - [ 4, 4 ] - - WorkGroup: - - [ 8, 16, 1 ] - - [ 8, 16, 2 ] - - [ 16, 16, 1 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - -LibraryLogic: - ScheduleName: Fiji - DeviceNames: ["R9 Nano"] - -LibraryClient: diff --git a/Tensile/Configs/test_hgemm_scalar_load_patterns.yaml b/Tensile/Configs/test_hgemm_scalar_load_patterns.yaml deleted file mode 100644 index c14940ba5..000000000 --- a/Tensile/Configs/test_hgemm_scalar_load_patterns.yaml +++ /dev/null @@ -1,137 +0,0 @@ -# test load patterns -GlobalParameters: - MinimumRequiredVersion: 3.0.0 - NumElementsToValidate: -1 - ValidationMaxToPrint: 4 - -BenchmarkProblems: - - # sgemm NT - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: False - TransposeB: True - UseBeta: False - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - WorkGroup: - - [ 8, 16, 2 ] - - ThreadTile: - - [ 4, 8 ] - - DepthU: [ 8 ] - - VectorWidth: [1] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - GlobalReadCoalesceVectorA: [False, True] - - GlobalReadCoalesceVectorB: [False, True] - - NumLoadsCoalescedA: [1, -1] - - NumLoadsCoalescedB: [1, -1] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - - # sgemm NN - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: False - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - WorkGroup: - - [ 8, 16, 2 ] - - ThreadTile: - - [ 4, 8 ] - - DepthU: [ 8 ] - - VectorWidth: [1] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - GlobalReadCoalesceVectorA: [False, True] - - GlobalReadCoalesceVectorB: [False, True] - - NumLoadsCoalescedA: [1, -1] - - NumLoadsCoalescedB: [1, -1] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TN - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: True - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - WorkGroup: - - [ 8, 16, 2 ] - - ThreadTile: - - [ 4, 8 ] - - DepthU: [ 8 ] - - VectorWidth: [1] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - GlobalReadCoalesceVectorA: [False, True] - - GlobalReadCoalesceVectorB: [False, True] - - NumLoadsCoalescedA: [1, -1] - - NumLoadsCoalescedB: [1, -1] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TT - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: True - TransposeB: True - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - WorkGroup: - - [ 8, 16, 2 ] - - ThreadTile: - - [ 4, 8 ] - - DepthU: [ 8 ] - - VectorWidth: [1] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - GlobalReadCoalesceVectorA: [False, True] - - GlobalReadCoalesceVectorB: [False, True] - - NumLoadsCoalescedA: [1, -1] - - NumLoadsCoalescedB: [1, -1] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - -LibraryLogic: - ScheduleName: Fiji - DeviceNames: ["R9 Nano"] - -LibraryClient: diff --git a/Tensile/Configs/test_hgemm_scalar_tile_sizes.yaml b/Tensile/Configs/test_hgemm_scalar_tile_sizes.yaml deleted file mode 100644 index cf202e4f8..000000000 --- a/Tensile/Configs/test_hgemm_scalar_tile_sizes.yaml +++ /dev/null @@ -1,190 +0,0 @@ -# test load patterns -GlobalParameters: - MinimumRequiredVersion: 3.0.0 - NumElementsToValidate: -1 - ValidationMaxToPrint: 4 - -BenchmarkProblems: - - # sgemm NT - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: False - TransposeB: True - UseBeta: False - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [2, 4] - - [2, 8] - - [3, 5] - - [4, 2] - - [4, 6] - - [5, 3] - - [5, 7] - - [6, 4] - - [6, 8] - - [7, 5] - - [8, 2] - - [8, 6] - - WorkGroup: - - [ 4, 8, 8 ] - - [ 4, 8, 4 ] - - [ 8, 8, 4 ] - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - DepthU: [ 16 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - - # sgemm NN - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: False - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [2, 5] - - [3, 2] - - [3, 6] - - [4, 3] - - [4, 7] - - [5, 4] - - [5, 8] - - [6, 5] - - [7, 2] - - [7, 6] - - [8, 3] - - [8, 7] - - [8, 8] - - WorkGroup: - - [ 4, 8, 8 ] - - [ 4, 8, 4 ] - - [ 8, 8, 4 ] - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - DepthU: [ 16 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TN - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: True - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [2, 2] - - [2, 6] - - [3, 3] - - [3, 7] - - [4, 4] - - [4, 8] - - [5, 5] - - [6, 2] - - [6, 6] - - [7, 3] - - [7, 7] - - [8, 4] - - WorkGroup: - - [ 8, 4, 8 ] - - [ 8, 4, 4 ] - - [ 8, 8, 4 ] - - [ 8, 8, 2 ] - - [ 16, 8, 2 ] - - [ 16, 8, 1 ] - - [ 16, 16, 1 ] - - DepthU: [ 16 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TT - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: True - TransposeB: True - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [2, 3] - - [2, 7] - - [3, 4] - - [3, 8] - - [4, 5] - - [5, 2] - - [5, 6] - - [6, 3] - - [6, 7] - - [7, 4] - - [7, 8] - - [8, 5] - - WorkGroup: - - [ 8, 4, 8 ] - - [ 8, 4, 4 ] - - [ 8, 8, 4 ] - - [ 8, 8, 2 ] - - [ 16, 8, 2 ] - - [ 16, 8, 1 ] - - [ 16, 16, 1 ] - - DepthU: [ 16 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - -LibraryLogic: - ScheduleName: Fiji - DeviceNames: ["R9 Nano"] - -LibraryClient: diff --git a/Tensile/Configs/test_hgemm_vectors.yaml b/Tensile/Configs/test_hgemm_vectors.yaml new file mode 100644 index 000000000..98f9c4e9e --- /dev/null +++ b/Tensile/Configs/test_hgemm_vectors.yaml @@ -0,0 +1,333 @@ +GlobalParameters: + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + PrintSolutionRejectionReason: False + MinimumRequiredVersion: 3.0.0 + NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + DataInitTypeAB: 1 + DataInitTypeC: 1 + +BenchmarkProblems: + + ############################################################################ + # NN + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: False + UseBeta: False + Batched: True + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [1, 2], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [2, 4] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [1], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [3], [63, 1, 64] ] + + + ############################################################################ + # NT + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: True + UseBeta: False + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [2, 4] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + + + ############################################################################ + # TN + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: True + TransposeB: False + UseBeta: True + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [2, 4] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + + + ############################################################################ + # TT + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: True + TransposeB: True + UseBeta: False + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [2, 4] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + +LibraryLogic: + ScheduleName: Test + DeviceNames: ["Unknown"] +LibraryClient: + diff --git a/Tensile/Configs/test_sgemm.yaml b/Tensile/Configs/test_sgemm.yaml new file mode 100644 index 000000000..f9d294262 --- /dev/null +++ b/Tensile/Configs/test_sgemm.yaml @@ -0,0 +1,333 @@ +GlobalParameters: + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + PrintSolutionRejectionReason: False + MinimumRequiredVersion: 3.0.0 + NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + DataInitTypeAB: 1 + DataInitTypeC: 1 + +BenchmarkProblems: + + ############################################################################ + # NN + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: False + UseBeta: False + Batched: True + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [1, 2], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [1] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [1], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [3], [63, 1, 64] ] + + + ############################################################################ + # NT + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: True + UseBeta: False + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [1] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + + + ############################################################################ + # TN + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: False + UseBeta: True + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [1] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + + + ############################################################################ + # TT + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: True + UseBeta: False + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [1] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + +LibraryLogic: + ScheduleName: Test + DeviceNames: ["Unknown"] +LibraryClient: + diff --git a/Tensile/Configs/test_sgemm_scalar_branches.yaml b/Tensile/Configs/test_sgemm_scalar_branches.yaml deleted file mode 100644 index 2a95f0d0d..000000000 --- a/Tensile/Configs/test_sgemm_scalar_branches.yaml +++ /dev/null @@ -1,141 +0,0 @@ -# test load patterns -GlobalParameters: - MinimumRequiredVersion: 3.0.0 - NumElementsToValidate: -1 - ValidationMaxToPrint: 4 - -BenchmarkProblems: - - # sgemm NT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: True - UseBeta: False - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["Branch"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - DepthU: [ 16 ] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [8, 8] - - [4, 8] - - [4, 4] - - WorkGroup: - - [ 8, 8, 2 ] - - [ 16, 8, 2 ] - - [ 16, 8, 1 ] - - [ 16, 16, 1 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - - # sgemm NN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["Branch"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - DepthU: [ 16 ] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [8, 8] - - [4, 8] - - [4, 4] - - WorkGroup: - - [ 8, 8, 2 ] - - [ 16, 8, 2 ] - - [ 16, 8, 1 ] - - [ 16, 16, 1 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["Branch"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [-1] - - DepthU: [ 16 ] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [8, 8] - - [4, 8] - - [4, 4] - - WorkGroup: - - [ 8, 8, 2 ] - - [ 16, 8, 2 ] - - [ 16, 8, 1 ] - - [ 16, 16, 1 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: True - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["Branch"] - - NumLoadsCoalescedA: [-1] - - NumLoadsCoalescedB: [1] - - DepthU: [ 16 ] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [8, 8] - - [4, 8] - - [4, 4] - - WorkGroup: - - [ 8, 8, 2 ] - - [ 16, 8, 2 ] - - [ 16, 8, 1 ] - - [ 16, 16, 1 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - -LibraryLogic: - ScheduleName: Fiji - DeviceNames: ["R9 Nano"] - -LibraryClient: diff --git a/Tensile/Configs/test_sgemm_scalar_load_patterns.yaml b/Tensile/Configs/test_sgemm_scalar_load_patterns.yaml deleted file mode 100644 index 6ac91b13b..000000000 --- a/Tensile/Configs/test_sgemm_scalar_load_patterns.yaml +++ /dev/null @@ -1,133 +0,0 @@ -# test load patterns -GlobalParameters: - MinimumRequiredVersion: 3.0.0 - NumElementsToValidate: -1 - ValidationMaxToPrint: 4 - -BenchmarkProblems: - - # sgemm NT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: True - UseBeta: False - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - WorkGroup: - - [ 8, 8, 2 ] - - ThreadTile: [ [4, 8] ] - - DepthU: [ 8 ] - - VectorWidth: [1] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - GlobalReadCoalesceVectorA: [False, True] - - GlobalReadCoalesceVectorB: [False, True] - - NumLoadsCoalescedA: [1, -1] - - NumLoadsCoalescedB: [1, -1] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - - # sgemm NN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - WorkGroup: - - [ 8, 8, 2 ] - - ThreadTile: [ [4, 8] ] - - DepthU: [ 8 ] - - VectorWidth: [1] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - GlobalReadCoalesceVectorA: [False, True] - - GlobalReadCoalesceVectorB: [False, True] - - NumLoadsCoalescedA: [1, -1] - - NumLoadsCoalescedB: [1, -1] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - WorkGroup: - - [ 8, 8, 2 ] - - ThreadTile: [ [4, 8] ] - - DepthU: [ 8 ] - - VectorWidth: [1] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - GlobalReadCoalesceVectorA: [False, True] - - GlobalReadCoalesceVectorB: [False, True] - - NumLoadsCoalescedA: [1, -1] - - NumLoadsCoalescedB: [1, -1] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: True - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - WorkGroup: - - [ 8, 8, 2 ] - - ThreadTile: [ [4, 8] ] - - DepthU: [ 8 ] - - VectorWidth: [1] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - GlobalReadCoalesceVectorA: [False, True] - - GlobalReadCoalesceVectorB: [False, True] - - NumLoadsCoalescedA: [1, -1] - - NumLoadsCoalescedB: [1, -1] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - -LibraryLogic: - ScheduleName: Fiji - DeviceNames: ["R9 Nano"] - -LibraryClient: diff --git a/Tensile/Configs/test_sgemm_scalar_tile_sizes.yaml b/Tensile/Configs/test_sgemm_scalar_tile_sizes.yaml deleted file mode 100644 index de320ae68..000000000 --- a/Tensile/Configs/test_sgemm_scalar_tile_sizes.yaml +++ /dev/null @@ -1,190 +0,0 @@ -# test load patterns -GlobalParameters: - MinimumRequiredVersion: 3.0.0 - NumElementsToValidate: -1 - ValidationMaxToPrint: 4 - -BenchmarkProblems: - - # sgemm NT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: True - UseBeta: False - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [2, 4] - - [2, 8] - - [3, 5] - - [4, 2] - - [4, 6] - - [5, 3] - - [5, 7] - - [6, 4] - - [6, 8] - - [7, 5] - - [8, 2] - - [8, 6] - - WorkGroup: - - [ 4, 8, 8 ] - - [ 4, 8, 4 ] - - [ 8, 8, 4 ] - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - DepthU: [ 16 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - - # sgemm NN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [2, 5] - - [3, 2] - - [3, 6] - - [4, 3] - - [4, 7] - - [5, 4] - - [5, 8] - - [6, 5] - - [7, 2] - - [7, 6] - - [8, 3] - - [8, 7] - - [8, 8] - - WorkGroup: - - [ 4, 8, 8 ] - - [ 4, 8, 4 ] - - [ 8, 8, 4 ] - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - DepthU: [ 16 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [2, 2] - - [2, 6] - - [3, 3] - - [3, 7] - - [4, 4] - - [4, 8] - - [5, 5] - - [6, 2] - - [6, 6] - - [7, 3] - - [7, 7] - - [8, 4] - - WorkGroup: - - [ 4, 8, 8 ] - - [ 4, 8, 4 ] - - [ 8, 8, 4 ] - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - DepthU: [ 16 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: True - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - VectorWidth: [1] - ForkParameters: - - ThreadTile: - - [2, 3] - - [2, 7] - - [3, 4] - - [3, 8] - - [4, 5] - - [5, 2] - - [5, 6] - - [6, 3] - - [6, 7] - - [7, 4] - - [7, 8] - - [8, 5] - - WorkGroup: - - [ 4, 8, 8 ] - - [ 4, 8, 4 ] - - [ 8, 8, 4 ] - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - DepthU: [ 16 ] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - -LibraryLogic: - ScheduleName: Fiji - DeviceNames: ["R9 Nano"] - -LibraryClient: diff --git a/Tensile/Configs/test_sgemm_vector_branches.yaml b/Tensile/Configs/test_sgemm_vector_branches.yaml deleted file mode 100644 index 8016d483f..000000000 --- a/Tensile/Configs/test_sgemm_vector_branches.yaml +++ /dev/null @@ -1,141 +0,0 @@ -# test load patterns -GlobalParameters: - MinimumRequiredVersion: 3.0.0 - NumElementsToValidate: -1 - ValidationMaxToPrint: 4 - -BenchmarkProblems: - - # sgemm NT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: True - UseBeta: False - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["Branch"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - DepthU: [ 16 ] - ForkParameters: - - ThreadTile: - - [8, 8] - - [4, 8] - - [4, 4] - - WorkGroup: - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - VectorWidth: [2, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - - # sgemm NN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["Branch"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - - DepthU: [ 16 ] - ForkParameters: - - ThreadTile: - - [8, 8] - - [4, 8] - - [4, 4] - - WorkGroup: - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - VectorWidth: [2, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["Branch"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [-1] - - DepthU: [ 16 ] - ForkParameters: - - ThreadTile: - - [8, 8] - - [4, 8] - - [4, 4] - - WorkGroup: - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - VectorWidth: [2, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: True - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["Branch"] - - NumLoadsCoalescedA: [-1] - - NumLoadsCoalescedB: [1] - - DepthU: [ 16 ] - ForkParameters: - - ThreadTile: - - [8, 8] - - [4, 8] - - [4, 4] - - WorkGroup: - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - VectorWidth: [2, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - -LibraryLogic: - ScheduleName: Fiji - DeviceNames: ["R9 Nano"] - -LibraryClient: diff --git a/Tensile/Configs/test_sgemm_vector_load_patterns.yaml b/Tensile/Configs/test_sgemm_vector_load_patterns.yaml deleted file mode 100644 index c49c6bfd4..000000000 --- a/Tensile/Configs/test_sgemm_vector_load_patterns.yaml +++ /dev/null @@ -1,137 +0,0 @@ -# test load patterns -GlobalParameters: - MinimumRequiredVersion: 3.0.0 - NumElementsToValidate: -1 - ValidationMaxToPrint: 4 - -BenchmarkProblems: - - # sgemm NT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: True - UseBeta: False - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - WorkGroup: - - [ 8, 8, 2 ] - - ThreadTile: - - [4, 8] - - DepthU: [ 8 ] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - GlobalReadCoalesceVectorA: [False, True] - - GlobalReadCoalesceVectorB: [False, True] - - NumLoadsCoalescedA: [1, -1] - - NumLoadsCoalescedB: [1, -1] - - VectorWidth: [2, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - - # sgemm NN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - WorkGroup: - - [ 8, 8, 2 ] - - ThreadTile: - - [4, 8] - - DepthU: [ 8 ] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - GlobalReadCoalesceVectorA: [False, True] - - GlobalReadCoalesceVectorB: [False, True] - - NumLoadsCoalescedA: [1, -1] - - NumLoadsCoalescedB: [1, -1] - - VectorWidth: [2, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - WorkGroup: - - [ 8, 8, 2 ] - - ThreadTile: - - [4, 8] - - DepthU: [ 8 ] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - GlobalReadCoalesceVectorA: [False, True] - - GlobalReadCoalesceVectorB: [False, True] - - NumLoadsCoalescedA: [1, -1] - - NumLoadsCoalescedB: [1, -1] - - VectorWidth: [2, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: True - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - WorkGroup: - - [ 8, 8, 2 ] - - ThreadTile: - - [4, 8] - - DepthU: [ 8 ] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - GlobalReadCoalesceVectorA: [False, True] - - GlobalReadCoalesceVectorB: [False, True] - - NumLoadsCoalescedA: [1, -1] - - NumLoadsCoalescedB: [1, -1] - - VectorWidth: [2, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - -LibraryLogic: - ScheduleName: Fiji - DeviceNames: ["R9 Nano"] - -LibraryClient: diff --git a/Tensile/Configs/test_sgemm_vector_tile_sizes.yaml b/Tensile/Configs/test_sgemm_vector_tile_sizes.yaml deleted file mode 100644 index 281a57e22..000000000 --- a/Tensile/Configs/test_sgemm_vector_tile_sizes.yaml +++ /dev/null @@ -1,190 +0,0 @@ -# test load patterns -GlobalParameters: - MinimumRequiredVersion: 3.0.0 - NumElementsToValidate: -1 - ValidationMaxToPrint: 4 - -BenchmarkProblems: - - # sgemm NT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: True - UseBeta: False - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - ForkParameters: - - ThreadTile: - - [2, 4] - - [2, 8] - - [3, 5] - - [4, 2] - - [4, 6] - - [5, 3] - - [5, 7] - - [6, 4] - - [6, 8] - - [7, 5] - - [8, 2] - - [8, 6] - - WorkGroup: - - [ 4, 8, 8 ] - - [ 4, 8, 4 ] - - [ 8, 8, 4 ] - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - DepthU: [ 16 ] - - VectorWidth: [2, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - - # sgemm NN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - ForkParameters: - - ThreadTile: - - [2, 5] - - [3, 2] - - [3, 6] - - [4, 3] - - [4, 7] - - [5, 4] - - [5, 8] - - [6, 5] - - [7, 2] - - [7, 6] - - [8, 3] - - [8, 7] - - [8, 8] - - WorkGroup: - - [ 4, 8, 8 ] - - [ 4, 8, 4 ] - - [ 8, 8, 4 ] - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - DepthU: [ 16 ] - - VectorWidth: [2, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: False - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - ForkParameters: - - ThreadTile: - - [2, 2] - - [2, 6] - - [3, 3] - - [3, 7] - - [4, 4] - - [4, 8] - - [5, 5] - - [6, 2] - - [6, 6] - - [7, 3] - - [7, 7] - - [8, 4] - - WorkGroup: - - [ 4, 8, 8 ] - - [ 4, 8, 4 ] - - [ 8, 8, 4 ] - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - DepthU: [ 16 ] - - VectorWidth: [2, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - - - # sgemm TT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: True - UseBeta: True - - # BenchmarkProblemSizeGroup - Standard - BenchmarkCommonParameters: - - ProblemSizes: - - Range: [ [4, 23, 0, 75], [4, 13, 0, 75], [63, 1, 64] ] - - LoopDoWhile: [False] - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - NumLoadsCoalescedA: [1] - - NumLoadsCoalescedB: [1] - ForkParameters: - - ThreadTile: - - [2, 3] - - [2, 7] - - [3, 4] - - [3, 8] - - [4, 5] - - [5, 2] - - [5, 6] - - [6, 3] - - [6, 7] - - [7, 4] - - [7, 8] - - [8, 5] - - WorkGroup: - - [ 4, 8, 8 ] - - [ 4, 8, 4 ] - - [ 8, 8, 4 ] - - [ 8, 8, 2 ] - - [ 8, 16, 2 ] - - [ 8, 16, 1 ] - - [ 16, 16, 1 ] - - DepthU: [ 16 ] - - VectorWidth: [2, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [128, 128, 256], [128, 128, 256], [64] ] - -LibraryLogic: - ScheduleName: Fiji - DeviceNames: ["R9 Nano"] - -LibraryClient: diff --git a/Tensile/Configs/test_sgemm_vectors.yaml b/Tensile/Configs/test_sgemm_vectors.yaml new file mode 100644 index 000000000..28a00ebc4 --- /dev/null +++ b/Tensile/Configs/test_sgemm_vectors.yaml @@ -0,0 +1,333 @@ +GlobalParameters: + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + PrintSolutionRejectionReason: False + MinimumRequiredVersion: 3.0.0 + NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + DataInitTypeAB: 1 + DataInitTypeC: 1 + +BenchmarkProblems: + + ############################################################################ + # NN + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: False + UseBeta: False + Batched: True + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [1, 2], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [2, 4] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [1], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [3], [63, 1, 64] ] + + + ############################################################################ + # NT + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: True + UseBeta: False + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [2, 4] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + + + ############################################################################ + # TN + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: False + UseBeta: True + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [2, 4] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + + + ############################################################################ + # TT + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: True + UseBeta: False + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + - NumLoadsCoalescedA: [-1] + - NumLoadsCoalescedB: [-1] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - GlobalReadCoalesceVectorA: [False, True] + - GlobalReadCoalesceVectorB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [2, 4] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopDoWhile: [False] + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - NumLoadsCoalescedA: [1, -1] + - NumLoadsCoalescedB: [1, -1] + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [2, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + +LibraryLogic: + ScheduleName: Test + DeviceNames: ["Unknown"] +LibraryClient: + diff --git a/Tensile/KernelWriter.py b/Tensile/KernelWriter.py index 192e31a51..9379dd08c 100644 --- a/Tensile/KernelWriter.py +++ b/Tensile/KernelWriter.py @@ -361,9 +361,9 @@ def kernelBody( self, kernel ): kStr += self.declareLoopNumIter(kernel) # open non-unrolled summation loops + kStr += self.calculateLoopNumIter(kernel, i) for i in range(0,kernel["ProblemType"]["NumIndicesSummation"]-1): kStr += self.comment("summation loop %u"%i) - kStr += self.calculateLoopNumIter(kernel, i) kStr += self.openLoop(kernel, i) #################################### @@ -411,7 +411,6 @@ def kernelBody( self, kernel ): # open unrolled summation loop kStr += self.comment("unrolled summation loop") - kStr += self.calculateLoopNumIter(kernel, self.unrollIdx) kStr += self.openLoop(kernel, self.unrollIdx) # unrolled loop: global read A, B @@ -1517,6 +1516,10 @@ def kernelBodyBetaOnly( self, kernel ): def getSourceFileStringBetaOnly(self, kernel): fileString = "" kernelName = self.getKernelNameBetaOnly(kernel) + if not globalParameters["MergeFiles"]: + fileString += "\n" + fileString += "#include \"%s.h\"\n" % kernelName + fileString += "\n" if self.language == "OCL": fileString += "const char * const %s_src = \"\"\n\"" % kernelName fileString += self.functionSignatureBetaOnly( kernel ) diff --git a/Tensile/KernelWriterSource.py b/Tensile/KernelWriterSource.py index 686fa945f..4ca75ea4a 100644 --- a/Tensile/KernelWriterSource.py +++ b/Tensile/KernelWriterSource.py @@ -377,7 +377,7 @@ def functionPrefix(self, kernel): kStr += "}%s" % (self.endLine) """ kStr += self.endLine - kStr += "__device__ void atomicAddType(%s%sfloat *fPtr, float operand) {%s" \ + kStr += "__device__ inline void atomicAddType(%s%sfloat *fPtr, float operand) {%s" \ % (self.volatileStr, self.globalPtrStr, self.endLine) kStr += " std::atomic *aPtr = reinterpret_cast*>(fPtr);%s" % (self.endLine) kStr += " float oldValue, newValue;%s" % (self.endLine) @@ -401,8 +401,13 @@ def functionPrefix(self, kernel): # real data kStr += "#define TYPE_MAC(MULA,MULB,DST) " \ + "DST = MAC(MULA,MULB,DST);" + self.endLine + # GSU if kernel["GlobalSplitU"] > 1: # 1st kernel will have taken care of B - kStr += "#define TYPE_MAC_WRITE(DST,ALPHA,REG,BETA) atomicAddType(&(DST), (ALPHA)*(REG));" + if kernel["ProblemType"]["UseBeta"]: + kStr += "#define TYPE_MAC_WRITE(DST,ALPHA,REG,BETA) atomicAddType(&(DST), (ALPHA)*(REG));" + else: + kStr += "#define TYPE_MAC_WRITE(DST,ALPHA,REG) atomicAddType(&(DST), (ALPHA)*(REG));" + else: if kernel["ProblemType"]["UseBeta"]: # dst = alpha*reg + dst*beta @@ -1577,7 +1582,7 @@ def calculateLoopNumIter(self, kernel, loopIdx): kStr += "%s numIter%s = 0;%s" \ % (self.indent, self.unrollChar, self.endLine) kStr += "%s}%s" % (self.indent, self.endLine) - #kStr += "if (serial==0) printf(\\\"n:%u\\\\n\\\", numIterK);" + self.endLine + #kStr += "if (serial==0) printf(\\\"WG%u_%u TK:%u\\\\n\\\", get_group_id(0), get_group_id(1), numIterK);" + self.endLine else: kStr += "%snumIter%s = size%s" \ % (self.indent, loopChar, loopChar) @@ -1596,6 +1601,7 @@ def calculateLoopNumIter(self, kernel, loopIdx): kStr += "%s}%s" % (self.indent, self.endLine) kStr += "%snumIter%s = numIterMyWg;%s" \ % (self.indent, self.unrollChar, self.endLine) + #kStr += "if (serial==0) printf(\\\"WG%u_%u UK:%u\\\\n\\\", get_group_id(0), get_group_id(1), numIterK);" + self.endLine return kStr @@ -1619,6 +1625,10 @@ def openLoop(self, kernel, loopIdx): (1 if (kernel["PrefetchGlobalRead"] and loopIdx == self.unrollIdx \ and not tailLoop) else 0), self.endLine) self.indent += " " + #if tailLoop: + # kStr += "if (serial==0) printf(\\\"WG%u_%u: ti=%u\\\\n\\\", get_group_id(0), get_group_id(1), numIterK);" + self.endLine + #else: + # kStr += "if (serial==0) printf(\\\"WG%u_%u: ui=%u\\\\n\\\", get_group_id(0), get_group_id(1), numIterK);" + self.endLine return kStr ############################################################################## @@ -1635,6 +1645,7 @@ def closeLoop(self, kernel, loopIdx): (1 if kernel["PrefetchGlobalRead"] else 0), self.endLine ) else: kStr += "%s}%s" % (self.indent, self.endLine) + #kStr += "if (serial==0) printf(\\\"WG%u_%u: rc0=%.0f\\\\n\\\", get_group_id(0), get_group_id(1), rC[0]);" + self.endLine return kStr ############################################################################## @@ -1654,8 +1665,12 @@ def macIter(self, kernel, black): ############################################################################## def openSumAtLeastUnroll(self, kernel): kStr = "" - kStr += "%sif (size%s >= LOCAL_DEPTHU) {%s" \ - % (self.indent, self.unrollChar, self.endLine) + if kernel["GlobalSplitU"] > 1: + kStr += "%sif (numIterMyWg >= 1) {%s" \ + % (self.indent, self.endLine) + else: + kStr += "%sif (size%s >= LOCAL_DEPTHU) {%s" \ + % (self.indent, self.unrollChar, self.endLine) self.indent += " " return kStr def closeSumAtLeastUnroll(self, kernel): @@ -1750,13 +1765,13 @@ def globalReadDoA(self, kernel, guardK): or self.readUnrollDimComponentsA or guardUnrolledComponents) else "") ) # guard around K if guardK: - kStr += "( globalReadOffsetA%s_%u%s%s%s >= (size%s %% LOCAL_DEPTHU)%s )" \ + kStr += "( globalReadOffsetA%s_%u%s%s >= (size%s %% LOCAL_DEPTHU%s)%s )" \ % (self.unrollChar, \ (perp if kernel["ProblemType"]["TLUA"] else para), \ (("_s%u"%s) if self.readUnrollDimComponentsA else ""), \ - (" - LOCAL_DEPTHU*gsuSumIdx" if kernel["GlobalSplitU"]>1 else ""), \ "+%u"%s if guardUnrolledComponents else "", \ self.unrollChar, \ + (" + LOCAL_DEPTHU*gsuSumIdx" if kernel["GlobalSplitU"]>1 else ""), \ (" || !numIter%s"%self.unrollChar) if kernel["GlobalSplitU"] > 1 else "") # guard around edge if kernel["EdgeType"] == "Branch": @@ -1793,13 +1808,13 @@ def globalReadDoB(self, kernel, guardK): else "") ) # guard around k if guardK: - kStr += "( globalReadOffsetB%s_%u%s%s%s >= (size%s %% LOCAL_DEPTHU)%s )" \ + kStr += "( globalReadOffsetB%s_%u%s%s >= (size%s %% LOCAL_DEPTHU%s)%s )" \ % (self.unrollChar, \ (perp if kernel["ProblemType"]["TLUB"] else para), \ (("_s%u"%s) if self.readUnrollDimComponentsB else ""), \ - (" - LOCAL_DEPTHU*gsuSumIdx" if kernel["GlobalSplitU"]>1 else ""), \ "+%u"%s if guardUnrolledComponents else "", \ self.unrollChar, \ + (" + LOCAL_DEPTHU*gsuSumIdx" if kernel["GlobalSplitU"]>1 else ""), \ (" || !numIter%s"%self.unrollChar) if kernel["GlobalSplitU"] > 1 else "") # guard around edge if kernel["EdgeType"] == "Branch": diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index 22e201644..d80e8a511 100644 --- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -124,10 +124,9 @@ def isReal(self): def isComplex(self): return not self.isReal() def isDouble(self): - if self.value == self.double or self.value == self.complexDouble: - return True - else: - return False + return self.value == self.double or self.value == self.complexDouble + def isSingle(self): + return self.value == self.single def isHalf(self): return self.value == self.half @@ -756,6 +755,10 @@ def assignDerivedParameters(state): if globalParameters["PrintSolutionRejectionReason"]: print1("GlobalSplitU and LoopTail require SummationAssignmentRoundRobin=True since strongly breaks Tensile kernel architecture") state["Valid"] = False + if not state["ProblemType"]["DataType"].isSingle(): + if globalParameters["PrintSolutionRejectionReason"]: + print1("GlobalSplitU only compatible with single precision") + state["Valid"] = False # how many elements to load diff --git a/Tensile/SolutionWriter.py b/Tensile/SolutionWriter.py index e33a8e129..f7ac139b1 100644 --- a/Tensile/SolutionWriter.py +++ b/Tensile/SolutionWriter.py @@ -299,6 +299,7 @@ def getSourceString(self, solution): else: s += "%sif( inputEvents != NULL )\n" % (t) s += "%s hipEventRecord(inputEvents[0], stream );\n" % (t) + s += "%stry {\n" % (t) if solution["ProblemType"]["UseBeta"]: s += "%sif (betaZero) {\n" % (t) t += " " @@ -336,6 +337,13 @@ def getSourceString(self, solution): for i in range(0, solution["ProblemType"]["NumIndicesC"]): s += "%ssize%s,\n" % (t, self.indexChars[i]) s += "%sbeta);\n" % (t) + s += "%s}\n" % (t) + + s += "%s} catch (const std::exception& e) {\n" % (t) + s += "#ifdef DEBUG\n" + s += "%s std::cerr << e.what() << std::endl;\n" % (t) + s += "#endif\n" + s += "%s return tensileStatusFailure;\n" % (t) s += "%s}\n" % (t) @@ -450,7 +458,7 @@ def getSourceString(self, solution): t = t[2:] s += "%s} catch (const std::exception& e) {\n" % (t) s += "#ifdef DEBUG\n" - s += "#%s std::cerr << e.what() << std::endl;\n" % (t) + s += "%s std::cerr << e.what() << std::endl;\n" % (t) s += "#endif\n" s += "%s return tensileStatusFailure;\n" % (t) s += "%s}\n" % (t) @@ -488,7 +496,7 @@ def getHeaderString(self, solution): if kernel != None: kernelName = self.kernelWriter.getKernelName(kernel) s += "#include \"" + kernelName + ".h\"\n" - for kernel in solution.getKernelsBeta(): + for kernel in solution.getKernelsBetaOnly(): kernelName = self.kernelWriter.getKernelNameBetaOnly(kernel) s += "#include \"" + kernelName + ".h\"\n" diff --git a/Tensile/Tensile.py b/Tensile/Tensile.py index 231b31a82..0966f033e 100644 --- a/Tensile/Tensile.py +++ b/Tensile/Tensile.py @@ -114,9 +114,13 @@ def Tensile(userArgs): argParser.add_argument("--kernel-language", dest="KernelLanguage", \ choices=["HIP", "OCL"], help="override which kernel language to use") argParser.add_argument("-v", "--verbose", action="store_true", \ - help="set PrintLevel=2 and LibraryPrintDebug=True") - argParser.add_argument("--debug", action="store_true", \ - help="set PrintLevel=2, LibraryPrintDebug=True and CMakeBuildType=Debug") + help="set PrintLevel=2") + argParser.add_argument("--debug", dest="debug", action="store_true", \ + help="set PrintLevel=2 and CMakeBuildType=Debug") + argParser.add_argument("--short-names", dest="shortNames", action="store_true", \ + help="use serial kernel and solution names") + argParser.add_argument("--no-merge-files", dest="noMergeFiles", action="store_true", \ + help="kernels and solutions written to individual files") # parse arguments args = argParser.parse_args(userArgs) @@ -148,12 +152,15 @@ def Tensile(userArgs): if args.verbose: print1("# Command-line override: PrintLevel") globalParameters["PrintLevel"] = 2 - globalParameters["LibraryPrintDebug"] = True if args.debug: print1("# Command-line override: Debug") globalParameters["PrintLevel"] = 2 - globalParameters["LibraryPrintDebug"] = True globalParameters["CMakeBuildType"] = "Debug" + if args.shortNames: + globalParameters["ShortNames"] = True + if args.noMergeFiles: + globalParameters["MergeFiles"] = False + print1("") executeStepsInConfig( config ) From 83d56b043f47875c2bdc29fd681fd2afed0076f4 Mon Sep 17 00:00:00 2001 From: David Tanner Date: Tue, 30 May 2017 12:19:28 -0500 Subject: [PATCH 2/3] don't force redo benchmarks still checks individual steps --- Jenkinsfile | 1 - Tensile/BenchmarkProblems.py | 7 +++++++ Tensile/Source/Client.h | 1 + Tensile/Tensile.py | 13 ++----------- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2a5108110..7097eacce 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -72,7 +72,6 @@ parallel rocm_fiji: { sh "tensile --runtime-language=HIP --kernel-language=HIP ../../Tensile/Configs/test_hgemm.yaml hgemm" sh "tensile --runtime-language=HIP --kernel-language=HIP ../../Tensile/Configs/test_sgemm.yaml sgemm" - sh "tensile --runtime-language=OCL --kernel-language=OCL ../../Tensile/Configs/test_hgemm_vectors.yaml hgemm_vectors" sh "tensile --runtime-language=OCL --kernel-language=OCL ../../Tensile/Configs/test_sgemm_vectors.yaml sgemm_vectors" //sh "tensile ../../Tensile/Configs/test_hgemm_scalar_load_patterns.yaml hgemm_scalar_load_patterns" diff --git a/Tensile/BenchmarkProblems.py b/Tensile/BenchmarkProblems.py index e5c04c1ae..72e652842 100644 --- a/Tensile/BenchmarkProblems.py +++ b/Tensile/BenchmarkProblems.py @@ -277,6 +277,8 @@ def benchmarkProblemType( problemTypeConfig, problemSizeGroupConfig, \ if process.returncode: printWarning("Benchmark Process exited with code %u" % process.returncode) popWorkingPath() # build + else: + print1("# Already benchmarked; skipping.") ############################################################################ @@ -437,6 +439,9 @@ def __init__(self): # Add Winning Parameters For Hardcoded Parameters def addResults( self, hardcodedParameterList, benchmarkPermutations, \ solutions, results): + if globalParameters["PrintLevel"] >= 1: + print1("# Adding Results to Solution Database") + progressBar = ProgressBar(len(results)) for hardcodedIdx in range(0, len(results)): hardcodedResults = results[hardcodedIdx] hardcodedParameters = hardcodedParameterList[hardcodedIdx] @@ -463,6 +468,8 @@ def addResults( self, hardcodedParameterList, benchmarkPermutations, \ #oldScore = matches[0][2] self.winners[hardcodedParametersKey][0].update(winningParameters) self.winners[hardcodedParametersKey][1] = winningScore + if globalParameters["PrintLevel"] >= 1: + progressBar.increment() ########################################################## diff --git a/Tensile/Source/Client.h b/Tensile/Source/Client.h index 75eb79947..7136c01f5 100644 --- a/Tensile/Source/Client.h +++ b/Tensile/Source/Client.h @@ -420,6 +420,7 @@ bool benchmarkAllSolutionsForSize( alpha, beta); } + fastestGFlops = 0; for (unsigned int solutionIdx = 0; solutionIdx < numSolutions; solutionIdx ++) { bool solutionIsValid = true; diff --git a/Tensile/Tensile.py b/Tensile/Tensile.py index 0966f033e..1d23675de 100644 --- a/Tensile/Tensile.py +++ b/Tensile/Tensile.py @@ -39,17 +39,8 @@ def executeStepsInConfig( config ): benchmarkDataPath = os.path.join(globalParameters["WorkingPath"], \ globalParameters["BenchmarkDataPath"]) if "BenchmarkProblems" in config: - if os.path.exists(benchmarkDataPath): - resultFiles = os.listdir(benchmarkDataPath) - else: - resultFiles = [] - - if len(resultFiles) < 2* len(config["BenchmarkProblems"]) \ - or globalParameters["ForceRedoBenchmarkProblems"]: - BenchmarkProblems.main( config["BenchmarkProblems"] ) - print1("") - else: - print1("# Benchmarking already done.") + BenchmarkProblems.main( config["BenchmarkProblems"] ) + print1("") ############################################################################## # Library Logic From 7ddfb77c96ba06a15f3034040af2205e4ee5016e Mon Sep 17 00:00:00 2001 From: David Tanner Date: Tue, 30 May 2017 13:15:24 -0500 Subject: [PATCH 3/3] removing opencl from jenkinsfile --- Jenkinsfile | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 7097eacce..b0da5ebfc 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -72,17 +72,8 @@ parallel rocm_fiji: { sh "tensile --runtime-language=HIP --kernel-language=HIP ../../Tensile/Configs/test_hgemm.yaml hgemm" sh "tensile --runtime-language=HIP --kernel-language=HIP ../../Tensile/Configs/test_sgemm.yaml sgemm" - sh "tensile --runtime-language=OCL --kernel-language=OCL ../../Tensile/Configs/test_sgemm_vectors.yaml sgemm_vectors" - - //sh "tensile ../../Tensile/Configs/test_hgemm_scalar_load_patterns.yaml hgemm_scalar_load_patterns" - //sh "tensile ../../Tensile/Configs/test_hgemm_scalar_tile_sizes.yaml hgemm_scalar_tile_sizes" - //sh "tensile ../../Tensile/Configs/test_hgemm_scalar_branches.yaml hgemm_scalar_branches" - //sh "tensile ../../Tensile/Configs/test_sgemm_scalar_load_patterns.yaml sgemm_scalar_load_patterns" - //sh "tensile ../../Tensile/Configs/test_sgemm_scalar_tile_sizes.yaml sgemm_scalar_tile_sizes" - //sh "tensile ../../Tensile/Configs/test_sgemm_scalar_branches.yaml sgemm_scalar_branches" - //sh "tensile ../../Tensile/Configs/test_sgemm_vector_load_patterns.yaml sgemm_vector_load_patterns" - //sh "tensile ../../Tensile/Configs/test_sgemm_vector_tile_sizes.yaml sgemm_vector_tile_sizes" - //sh "tensile ../../Tensile/Configs/test_sgemm_vector_branches.yaml sgemm_vector_branches" + // TODO re-enable when jenkins supports opencl + //sh "tensile --runtime-language=OCL --kernel-language=OCL ../../Tensile/Configs/test_sgemm_vectors.yaml sgemm_vectors" } } }