Browse Source

debug

tags/v1.22.8.1^2
lewis 3 years ago
parent
commit
80771f2dd9
2 changed files with 40 additions and 10 deletions
  1. +1
    -1
      models/cloudbrain_temp.go
  2. +39
    -9
      modules/modelarts/modelarts.go

+ 1
- 1
models/cloudbrain_temp.go View File

@@ -47,7 +47,7 @@ func getCloudBrainTemp(temp *CloudbrainTemp) (*CloudbrainTemp, error) {

func GetCloudBrainTempJobs() ([]*CloudbrainTemp, error) {
jobs := make([]*CloudbrainTemp, 0, 10)
return jobs, x.In("status", TempJobStatus, string(ModelArtsStopped)).
return jobs, x.In("status", TempJobStatus, string(ModelArtsStopping), string(ModelArtsTrainJobKilling)).
And("query_times < ?", setting.MaxTempQueryTimes).
Limit(100).
Find(&jobs)


+ 39
- 9
modules/modelarts/modelarts.go View File

@@ -909,6 +909,13 @@ func handleNotebook(temp *models.CloudbrainTemp) error {
log.Error("DelNotebook2 failed:%v", err)
return err
}

temp.Status = string(models.ModelArtsDeleted)
err = models.UpdateCloudbrainTemp(temp)
if err != nil {
log.Error("UpdateCloudbrainTemp failed:%v", err)
return err
}
}
}

@@ -957,12 +964,12 @@ func handleTempNotebook(temp *models.CloudbrainTemp) error {

if isExist {
log.Info("find the record(%s)", temp.JobName)
res, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
_, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
if err != nil {
log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
break
}
temp.Status = res.Status
temp.Status = string(models.ModelArtsStopping)
models.UpdateCloudbrainTemp(temp)
} else {
log.Error("can not find the record(%s) till now", temp.JobName)
@@ -999,7 +1006,7 @@ func handleTrainJob(temp *models.CloudbrainTemp) error {
log.Error("handleTempTrainJob failed:%v", err)
return err
}
} else if temp.Status == string(models.ModelArtsStopping) {
} else if temp.Status == string(models.ModelArtsTrainJobKilling) {
res, err := GetTrainJob(temp.JobID, temp.VersionID)
if err != nil {
log.Error("GetTrainJob failed:%v", err)
@@ -1007,7 +1014,7 @@ func handleTrainJob(temp *models.CloudbrainTemp) error {
}

temp.Status = TransTrainJobStatus(res.IntStatus)
if temp.Status == string(models.ModelArtsStopped) {
if temp.Status == string(models.ModelArtsTrainJobKilled) {
err = models.UpdateCloudbrainTemp(temp)
if err != nil {
log.Error("UpdateCloudbrainTemp failed:%v", err)
@@ -1019,6 +1026,13 @@ func handleTrainJob(temp *models.CloudbrainTemp) error {
log.Error("DelTrainJob failed:%v", err)
return err
}

temp.Status = string(models.ModelArtsDeleted)
err = models.UpdateCloudbrainTemp(temp)
if err != nil {
log.Error("UpdateCloudbrainTemp failed:%v", err)
return err
}
}
}

@@ -1032,7 +1046,7 @@ func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
log.Error("handleTempTrainJobMultiVersion failed:%v", err)
return err
}
} else if temp.Status == string(models.ModelArtsStopping) {
} else if temp.Status == string(models.ModelArtsTrainJobKilling) {
res, err := GetTrainJob(temp.JobID, temp.VersionID)
if err != nil {
log.Error("GetTrainJob failed:%v", err)
@@ -1040,7 +1054,7 @@ func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
}

temp.Status = TransTrainJobStatus(res.IntStatus)
if temp.Status == string(models.ModelArtsStopped) {
if temp.Status == string(models.ModelArtsTrainJobKilled) {
err = models.UpdateCloudbrainTemp(temp)
if err != nil {
log.Error("UpdateCloudbrainTemp failed:%v", err)
@@ -1052,6 +1066,13 @@ func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
log.Error("DelTrainJob failed:%v", err)
return err
}

temp.Status = string(models.ModelArtsDeleted)
err = models.UpdateCloudbrainTemp(temp)
if err != nil {
log.Error("UpdateCloudbrainTemp failed:%v", err)
return err
}
}

}
@@ -1077,7 +1098,6 @@ func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
}

if result != nil {
//todo: check find
count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
if result.VersionCount == int64(count+1) {
log.Info("find the record(%s)", temp.JobName)
@@ -1091,12 +1111,16 @@ func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
log.Error("StopTrainJob failed:%v", err)
break
}
temp.Status = string(models.ModelArtsStopping)
temp.Status = string(models.ModelArtsTrainJobKilling)
err = models.UpdateCloudbrainTemp(temp)
if err != nil {
log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
break
}
} else {
log.Error("can not find the record(%s) till now", temp.JobName)
err = errors.New("not found")
break
}
}

@@ -1150,7 +1174,7 @@ func handleTempTrainJob(temp *models.CloudbrainTemp) error {
break
}

temp.Status = string(models.ModelArtsStopping)
temp.Status = string(models.ModelArtsTrainJobKilling)
err = models.UpdateCloudbrainTemp(temp)
if err != nil {
log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
@@ -1158,6 +1182,12 @@ func handleTempTrainJob(temp *models.CloudbrainTemp) error {
}
}
}

if !isExist {
log.Error("can not find the record(%s) till now", temp.JobName)
err = errors.New("not found")
break
}
}

break


Loading…
Cancel
Save