Browse Source

fix-333优化,增加错误重试,支持等待的云脑任务删除

tags/v1.21.12.1
ychao_1983 4 years ago
parent
commit
deaf448668
2 changed files with 25 additions and 4 deletions
  1. +2
    -2
      models/cloudbrain.go
  2. +23
    -2
      routers/repo/cloudbrain.go

+ 2
- 2
models/cloudbrain.go View File

@@ -629,13 +629,13 @@ func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {

func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
cloudBrains := make([]*Cloudbrain, 0)
err := x.Cols("job_id", "status", "type").Where("user_id=? AND (status =? OR status=?)", userID, string(JobRunning), string(JobWaiting)).Find(&cloudBrains)
err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
return cloudBrains, err
}

func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) {
cloudBrains := make([]*Cloudbrain, 0)
err := x.Cols("job_id", "status", "type").Where("repo_id=? AND (status =? OR status=?)", repoID, string(JobRunning), string(JobWaiting)).Find(&cloudBrains)
err := x.Cols("job_id", "status", "type").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
return cloudBrains, err
}



+ 23
- 2
routers/repo/cloudbrain.go View File

@@ -4,6 +4,7 @@ import (
"bufio"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"os"
@@ -391,19 +392,39 @@ func StopJobs(cloudBrains []*models.Cloudbrain) {
for _, taskInfo := range cloudBrains {

if taskInfo.Type == models.TypeCloudBrainOne {
err := cloudbrain.StopJob(taskInfo.JobID)
err := retry(3, time.Second*30, func() error {
return cloudbrain.StopJob(taskInfo.JobID)
})

logErrorAndUpdateJobStatus(err, taskInfo)
} else {
param := models.NotebookAction{
Action: models.ActionStop,
}
_, err := modelarts.StopJob(taskInfo.JobID, param)
err := retry(3, time.Second*30, func() error {
_, err := modelarts.StopJob(taskInfo.JobID, param)
return err
})
logErrorAndUpdateJobStatus(err, taskInfo)
}

}
}

func retry(attempts int, sleep time.Duration, f func() error) (err error) {
for i := 0; i < attempts; i++ {
if i > 0 {
log.Warn("retrying after error:", err)
time.Sleep(sleep)
}
err = f()
if err == nil {
return nil
}
}
return fmt.Errorf("after %d attempts, last error: %s", attempts, err)
}

func logErrorAndUpdateJobStatus(err error, taskInfo *models.Cloudbrain) {
if err != nil {
log.Warn("Failed to stop cloudBrain job:"+taskInfo.JobID, err)


Loading…
Cancel
Save