diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 4f615ed00..dd4e9a6f4 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -629,13 +629,13 @@ func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) { func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) { cloudBrains := make([]*Cloudbrain, 0) - err := x.Cols("job_id", "status", "type").Where("user_id=? AND (status =? OR status=?)", userID, string(JobRunning), string(JobWaiting)).Find(&cloudBrains) + err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains) return cloudBrains, err } func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) { cloudBrains := make([]*Cloudbrain, 0) - err := x.Cols("job_id", "status", "type").Where("repo_id=? AND (status =? OR status=?)", repoID, string(JobRunning), string(JobWaiting)).Find(&cloudBrains) + err := x.Cols("job_id", "status", "type").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains) return cloudBrains, err } diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index ccd51a89f..0bafe24aa 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -4,6 +4,7 @@ import ( "bufio" "encoding/json" "errors" + "fmt" "io" "net/http" "os" @@ -391,19 +392,39 @@ func StopJobs(cloudBrains []*models.Cloudbrain) { for _, taskInfo := range cloudBrains { if taskInfo.Type == models.TypeCloudBrainOne { - err := cloudbrain.StopJob(taskInfo.JobID) + err := retry(3, time.Second*30, func() error { + return cloudbrain.StopJob(taskInfo.JobID) + }) + logErrorAndUpdateJobStatus(err, taskInfo) } else { param := models.NotebookAction{ Action: models.ActionStop, } - _, err := modelarts.StopJob(taskInfo.JobID, param) + err := retry(3, time.Second*30, func() error { + _, err := modelarts.StopJob(taskInfo.JobID, param) + return err + }) logErrorAndUpdateJobStatus(err, taskInfo) } } } +func retry(attempts int, sleep time.Duration, f func() error) (err error) { + for i := 0; i < attempts; i++ { + if i > 0 { + log.Warn("retrying after error:", err) + time.Sleep(sleep) + } + err = f() + if err == nil { + return nil + } + } + return fmt.Errorf("after %d attempts, last error: %s", attempts, err) +} + func logErrorAndUpdateJobStatus(err error, taskInfo *models.Cloudbrain) { if err != nil { log.Warn("Failed to stop cloudBrain job:"+taskInfo.JobID, err)