From 4add14ec35ddbdc6b5330625231513b8daed2c18 Mon Sep 17 00:00:00 2001 From: Vanessasaurus <814322+vsoch@users.noreply.github.com> Date: Thu, 10 Oct 2024 18:16:53 -0700 Subject: [PATCH] Fix image libintern (#20) * bug: name is removed from response Problem: This is the same issue that hit fluence - Name was removed from the Match response. Solution: Remove it from here, since it is not largely used. Signed-off-by: vsoch --- .github/test.sh | 4 ++-- README.md | 2 +- src/Makefile | 4 ++-- src/build/scheduler/Dockerfile | 14 +++----------- src/fluxnetes/go.mod | 2 +- src/fluxnetes/go.sum | 4 ++-- src/fluxnetes/pkg/jgf/jgf.go | 8 +++----- src/fluxnetes/pkg/jgf/jgf_test.go | 8 ++++---- src/fluxnetes/pkg/jgf/types.go | 2 +- src/fluxnetes/pkg/utils/utils.go | 8 +++----- 10 files changed, 22 insertions(+), 34 deletions(-) diff --git a/.github/test.sh b/.github/test.sh index 2157ce1..5af3a7c 100644 --- a/.github/test.sh +++ b/.github/test.sh @@ -47,7 +47,7 @@ fluxnetes_scheduler=$(kubectl get pods --selector=job-name=job -o json | jq -r . echo echo "Fluxnetes job pod is ${fluxnetes_job_pod}" -sleep 10 +sleep 30 # Shared function to check output function check_output { @@ -75,4 +75,4 @@ check_output 'check-scheduled-by' "${fluxnetes_scheduler}" "fluxnetes" # But events tell us actually what happened, let's parse throught them and find our pods # This tells us the Event -> reason "Scheduled" and who it was reported by. reported_by=$(kubectl events --for pod/${fluxnetes_job_pod} -o json | jq -c '[ .items[] | select( .reason | contains("Scheduled")) ]' | jq -r .[0].reportingComponent) -check_output 'reported-by-fluxnetes' "${reported_by}" "fluxnetes" \ No newline at end of file +check_output 'reported-by-fluxnetes' "${reported_by}" "fluxnetes" diff --git a/README.md b/README.md index 9151cfa..b4b804b 100644 --- a/README.md +++ b/README.md @@ -172,11 +172,11 @@ SELECT group_name, group_size from pods_provisional; ### TODO +- [ ] kubectl plugin to get fluxion state? - [ ] Figure out how In-tree registry plugins (that are related to resources) should be run to inform fluxion - we likely want to move assume pod outside of that schedule function, or ensure pod passed matches. - [ ] Optimize queries. - [ ] Restarting with postgres shouldn't have crashloopbackoff when the database isn't ready yet -- [ ] need to cancel reservations and clear table at end of cycle - [ ] The queue should inherit (and return) the start time (when the pod was first seen) "start" in scheduler.go - Testing: - [ ] need to test duration / completion time works (run job with short duration, should be cancelled/cleaned up) diff --git a/src/Makefile b/src/Makefile index 3a9a5d1..ce89d8f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,7 +4,7 @@ INSTALL_PREFIX ?= /usr LIB_PREFIX ?= /usr/lib LOCALBIN ?= $(shell pwd)/bin COMMONENVVAR=GOOS=$(shell uname -s | tr A-Z a-z) -BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT} -I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${LIB_PREFIX} -L${LIB_PREFIX}/flux -L${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -ljansson -lhwloc -lboost_system -lflux-hostlist -lboost_graph -lyaml-cpp" +BUILDENVVAR=CGO_CFLAGS="-I${FLUX_SCHED_ROOT} -I${FLUX_SCHED_ROOT}/resource/reapi/bindings/c" CGO_LDFLAGS="-L${LIB_PREFIX} -L${LIB_PREFIX}/flux -L${FLUX_SCHED_ROOT}/resource/reapi/bindings -lreapi_cli -lflux-idset -lstdc++ -ljansson -lhwloc -lflux-hostlist -lboost_graph -lyaml-cpp" LOCAL_REGISTRY=localhost:5000 @@ -35,4 +35,4 @@ protoc: $(LOCALBIN) .PHONY: proto proto: protoc PATH=$(LOCALBIN):${PATH} protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative fluxnetes/pkg/fluxion-grpc/fluxion.proto - PATH=$(LOCALBIN):${PATH} protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative fluxnetes/pkg/service-grpc/service.proto \ No newline at end of file + PATH=$(LOCALBIN):${PATH} protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative fluxnetes/pkg/service-grpc/service.proto diff --git a/src/build/scheduler/Dockerfile b/src/build/scheduler/Dockerfile index 7078cde..49c99d4 100644 --- a/src/build/scheduler/Dockerfile +++ b/src/build/scheduler/Dockerfile @@ -31,19 +31,11 @@ RUN go mod tidy && \ make server FLUX_SCHED_ROOT=/opt/flux-sched # minimize build! -FROM ubuntu:jammy +FROM fluxrm/flux-sched:jammy COPY --from=builder /go/src/fluxnetes/bin/server /bin/fluxion-service COPY --from=builder /usr/lib/flux/ /usr/lib/flux COPY --from=builder /usr/lib/libflux* /usr/lib/ -RUN apt-get update && apt-get -qq install -y --no-install-recommends \ - libboost-graph-dev \ - libboost-system-dev \ - libboost-filesystem-dev \ - libboost-regex-dev \ - libyaml-cpp-dev \ - libjansson-dev \ - hwloc && \ - apt-get clean && \ - mkdir -p /home/data/jobspecs /home/data/jgf && chmod -R ugo+rwx /home/data +USER root +RUN mkdir -p /home/data/jobspecs /home/data/jgf && chmod -R ugo+rwx /home/data ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/usr/lib/flux \ No newline at end of file diff --git a/src/fluxnetes/go.mod b/src/fluxnetes/go.mod index 6eeb614..782c7da 100644 --- a/src/fluxnetes/go.mod +++ b/src/fluxnetes/go.mod @@ -3,7 +3,7 @@ module github.com/converged-computing/fluxnetes go 1.21 require ( - github.com/flux-framework/fluxion-go v0.32.1-0.20240420052153-909523c84ca2 + github.com/flux-framework/fluxion-go v0.39.0 github.com/stretchr/testify v1.7.0 google.golang.org/grpc v1.38.0 google.golang.org/protobuf v1.26.0 diff --git a/src/fluxnetes/go.sum b/src/fluxnetes/go.sum index c7291ca..40c5ae3 100644 --- a/src/fluxnetes/go.sum +++ b/src/fluxnetes/go.sum @@ -98,8 +98,8 @@ github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d/go.mod h1:ZZM github.com/fatih/camelcase v1.0.0/go.mod h1:yN2Sb0lFhZJUdVvtELVWefmrXpuZESvPmqwoZc+/fpc= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= -github.com/flux-framework/fluxion-go v0.32.1-0.20240420052153-909523c84ca2 h1:Yz/vVX0XfB2q51ZLh2p8YI5vphvv0rZF4PqtKPscvsY= -github.com/flux-framework/fluxion-go v0.32.1-0.20240420052153-909523c84ca2/go.mod h1:jA5+kOSLxchFzixzYEvMAGjkXB5yszO/HxUwdhX/5/U= +github.com/flux-framework/fluxion-go v0.39.0 h1:f68CTxHouyOvjfgu5YKYFHQ405vxtdSlG8crPph8+DU= +github.com/flux-framework/fluxion-go v0.39.0/go.mod h1:jA5+kOSLxchFzixzYEvMAGjkXB5yszO/HxUwdhX/5/U= github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= github.com/form3tech-oss/jwt-go v3.2.3+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= diff --git a/src/fluxnetes/pkg/jgf/jgf.go b/src/fluxnetes/pkg/jgf/jgf.go index 070703a..bdc93f0 100644 --- a/src/fluxnetes/pkg/jgf/jgf.go +++ b/src/fluxnetes/pkg/jgf/jgf.go @@ -97,11 +97,9 @@ func (g *FluxJGF) MakeBidirectionalEdge(parent, child string) { // MakeEdge creates an edge for the JGF func (g *FluxJGF) MakeEdge(source string, target string, contains string) { newedge := edge{ - Source: source, - Target: target, - Metadata: edgeMetadata{ - Name: map[string]string{containmentKey: contains}, - }, + Source: source, + Target: target, + Metadata: edgeMetadata{Subsystem: containmentKey}, } g.Graph.Edges = append(g.Graph.Edges, newedge) } diff --git a/src/fluxnetes/pkg/jgf/jgf_test.go b/src/fluxnetes/pkg/jgf/jgf_test.go index 1d1a596..173bfcc 100644 --- a/src/fluxnetes/pkg/jgf/jgf_test.go +++ b/src/fluxnetes/pkg/jgf/jgf_test.go @@ -44,8 +44,8 @@ func TestNewFluxJGF(t *testing.T) { fmt.Println(out) // Add some nodes! - computeNodeA := fluxgraph.MakeNode("node", subnetNodeA.Metadata.Name, 0) - computeNodeB := fluxgraph.MakeNode("node", subnetNodeB.Metadata.Name, 1) + computeNodeA := fluxgraph.MakeNode("node", subnetNodeA.Metadata.Type, 0) + computeNodeB := fluxgraph.MakeNode("node", subnetNodeB.Metadata.Type, 1) fluxgraph.MakeBidirectionalEdge(subnetNodeA.Id, computeNodeA.Id) fluxgraph.MakeBidirectionalEdge(subnetNodeB.Id, computeNodeB.Id) @@ -56,11 +56,11 @@ func TestNewFluxJGF(t *testing.T) { fmt.Println(out) // Add a GPU to one, and cores to the other - subpath := fmt.Sprintf("%s/%s", subnetNodeA.Metadata.Name, computeNodeA.Metadata.Name) + subpath := fmt.Sprintf("%s/%s", subnetNodeA.Metadata.Type, computeNodeA.Metadata.Type) gpuNodeA := fluxgraph.MakeGPU(NvidiaGPU, subpath, 1, 0) fluxgraph.MakeBidirectionalEdge(computeNodeA.Id, gpuNodeA.Id) - subpath = fmt.Sprintf("%s/%s", subnetNodeB.Metadata.Name, computeNodeB.Metadata.Name) + subpath = fmt.Sprintf("%s/%s", subnetNodeB.Metadata.Type, computeNodeB.Metadata.Type) coreNode := fluxgraph.MakeCore(CoreType, subpath, 0) fluxgraph.MakeBidirectionalEdge(computeNodeB.Id, coreNode.Id) diff --git a/src/fluxnetes/pkg/jgf/types.go b/src/fluxnetes/pkg/jgf/types.go index 8359c28..79f5946 100644 --- a/src/fluxnetes/pkg/jgf/types.go +++ b/src/fluxnetes/pkg/jgf/types.go @@ -33,7 +33,7 @@ type edge struct { } type edgeMetadata struct { - Name map[string]string `json:"name,omitempty"` + Subsystem string `json:"subsystem"` } type nodeMetadata struct { diff --git a/src/fluxnetes/pkg/utils/utils.go b/src/fluxnetes/pkg/utils/utils.go index 2666752..2c35596 100644 --- a/src/fluxnetes/pkg/utils/utils.go +++ b/src/fluxnetes/pkg/utils/utils.go @@ -256,7 +256,6 @@ func computeTotalRequests(podList *corev1.PodList) map[corev1.ResourceName]resou type allocation struct { Type string - Name string Basename string CoreCount int } @@ -290,7 +289,6 @@ func ParseAllocResult(allocated, groupName string) []allocation { if metadata["type"].(string) == jgf.NodeType { result = append(result, allocation{ Type: metadata["type"].(string), - Name: metadata["name"].(string), Basename: metadata["basename"].(string), CoreCount: corecount, }) @@ -301,9 +299,9 @@ func ParseAllocResult(allocated, groupName string) []allocation { } fmt.Printf("Final node result for %s\n", groupName) for i, alloc := range result { - fmt.Printf("Node %d: %s\n", i, alloc.Name) - fmt.Printf(" Type: %s\n Name: %s\n Basename: %s\n CoreCount: %d\n", - alloc.Type, alloc.Name, alloc.Basename, alloc.CoreCount) + fmt.Printf("Node %d: %s\n", i, alloc.Basename) + fmt.Printf(" Type: %s\n Basename: %s\n CoreCount: %d\n", + alloc.Type, alloc.Basename, alloc.CoreCount) } return result