Skip to content

Commit

Permalink
fix bug: allocated gpus is 0 when using arena top node (#628)
Browse files Browse the repository at this point in the history
  • Loading branch information
happy2048 authored Aug 5, 2021
1 parent e51b97e commit 70486e5
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 20 deletions.
23 changes: 11 additions & 12 deletions pkg/apis/utils/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"errors"
"fmt"
"sort"
"strconv"
"strings"
"time"

Expand All @@ -28,21 +27,21 @@ func CPUCountInPod(pod *v1.Pod) float64 {
}

// GPUCountInPod sums the per-container values that ResourceInContainers
// reports for types.NvidiaGPUResourceName and returns the total as an int.
//
// The values are accumulated as int64. A value whose dynamic type is not
// int64 is skipped rather than triggering a panic from an unchecked type
// assertion.
func GPUCountInPod(pod *v1.Pod) int {
	total := int64(0)
	for _, count := range ResourceInContainers(pod, types.NvidiaGPUResourceName) {
		c, ok := count.(int64)
		if !ok {
			// Unexpected type: ignore this container instead of panicking.
			continue
		}
		total += c
	}
	return int(total)
}

// AliyunGPUCountInPod sums the per-container values that ResourceInContainers
// reports for types.AliyunGPUResourceName and returns the total as an int.
//
// The values are accumulated as int64. A value whose dynamic type is not
// int64 is skipped rather than triggering a panic from an unchecked type
// assertion.
func AliyunGPUCountInPod(pod *v1.Pod) int {
	total := int64(0)
	for _, count := range ResourceInContainers(pod, types.AliyunGPUResourceName) {
		c, ok := count.(int64)
		if !ok {
			// Unexpected type: ignore this container instead of panicking.
			continue
		}
		total += c
	}
	return int(total)
}

func ResourceInContainers(pod *v1.Pod, resourceName string) map[int]interface{} {
Expand Down Expand Up @@ -143,12 +142,12 @@ func DefinePodPhaseStatus(pod v1.Pod) (string, int, int, int) {
}

// GPUMemoryCountInPod sums the per-container values that ResourceInContainers
// reports for types.GPUShareResourceName and returns the total as an int.
//
// The values are accumulated as int64. A value whose dynamic type is not
// int64 is skipped rather than triggering a panic from an unchecked type
// assertion.
func GPUMemoryCountInPod(pod *v1.Pod) int {
	total := int64(0)
	for _, count := range ResourceInContainers(pod, types.GPUShareResourceName) {
		c, ok := count.(int64)
		if !ok {
			// Unexpected type: ignore this container instead of panicking.
			continue
		}
		total += c
	}
	return int(total)
}

func GetContainerAllocation(pod *v1.Pod) map[int]map[string]int {
Expand Down Expand Up @@ -316,9 +315,9 @@ func GetDurationOfPod(pod *v1.Pod) time.Duration {
}

// parseInt converts an int64 held in an interface value to an int.
//
// It returns an error when i does not hold an int64 (including a nil
// interface). The earlier string-based parsing is gone: a string assertion
// always fails for int64 values, which made every caller see zero.
func parseInt(i interface{}) (int, error) {
	n, ok := i.(int64)
	if !ok {
		return 0, errors.New("invalid value")
	}
	return int(n), nil
}
5 changes: 5 additions & 0 deletions pkg/podlogs/logger.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"io"
"strings"

"github.com/kubeflow/arena/pkg/apis/config"
"github.com/kubeflow/arena/pkg/apis/types"
Expand Down Expand Up @@ -96,6 +97,10 @@ func (p *PodLogger) ensureContainerStarted() error {
log.Debugf("pod:%s,pod phase: %v\n", p.InstanceName, pod.Status.Phase)
log.Debugf("pod print status: %s\n", status)
switch podPhase := pod.Status.Phase; {
case podPhase == v1.PodPending && strings.Index(status, "Init:") == 0:
return nil
case podPhase == v1.PodPending && strings.Index(status, "PodInitializing") == 0:
return nil
case podPhase == v1.PodRunning && status != "Unknown":
return nil
case podPhase == v1.PodFailed || podPhase == v1.PodSucceeded:
Expand Down
14 changes: 6 additions & 8 deletions pkg/training/logs.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,21 +50,19 @@ func AcceptJobLog(jobName string, trainingType types.TrainingJobType, args *type
podStatuses[pod.Name] = status
}
// 4.if the instance name is invalid,return error
_, ok := podStatuses[args.InstanceName]
status, ok := podStatuses[args.InstanceName]
if !ok {
return fmt.Errorf("invalid instance name %v in job %v,please use 'arena get %v' to make sure instance name.",
args.InstanceName,
jobName,
jobName,
)
}
// 5.if the instance status is not running,return error
//if status != "Running" {
// return fmt.Errorf("failed to get logs of instance %v,because it is not running,please use 'arena get %v' to make sure instance status",
// args.InstanceName,
// jobName,
// )
//}
if strings.Index(status, "Init:") == 0 || strings.Index(status, "PodInitializing") == 0 {
if args.ContainerName == "" {
args.ContainerName = "init-code"
}
}
logger := podlogs.NewPodLogger(args)
_, err = logger.AcceptLogs()
return err
Expand Down

0 comments on commit 70486e5

Please sign in to comment.