Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/1714 Reviewed-by: lewis <747342561@qq.com>tags/v1.22.3.2^2
| @@ -1,6 +1,7 @@ | |||
| package models | |||
| import ( | |||
| "code.gitea.io/gitea/modules/util" | |||
| "encoding/json" | |||
| "fmt" | |||
| "strconv" | |||
| @@ -102,15 +103,15 @@ type Cloudbrain struct { | |||
| ContainerIp string | |||
| CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"` | |||
| UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` | |||
| Duration int64 | |||
| TrainJobDuration string | |||
| Image string //镜像名称 | |||
| GpuQueue string //GPU类型即GPU队列 | |||
| ResourceSpecId int //GPU规格id | |||
| DeletedAt time.Time `xorm:"deleted"` | |||
| CanDebug bool `xorm:"-"` | |||
| CanDel bool `xorm:"-"` | |||
| CanModify bool `xorm:"-"` | |||
| Duration int64 `xorm:"DEFAULT 0"` //运行时长 单位秒 | |||
| TrainJobDuration string `xorm:"DEFAULT '00:00:00'"` | |||
| Image string //镜像名称 | |||
| GpuQueue string //GPU类型即GPU队列 | |||
| ResourceSpecId int //GPU规格id | |||
| DeletedAt time.Time `xorm:"deleted"` | |||
| CanDebug bool `xorm:"-"` | |||
| CanDel bool `xorm:"-"` | |||
| CanModify bool `xorm:"-"` | |||
| Type int | |||
| BenchmarkTypeID int | |||
| BenchmarkChildTypeID int | |||
| @@ -150,6 +151,44 @@ type Cloudbrain struct { | |||
| Repo *Repository `xorm:"-"` | |||
| BenchmarkTypeName string `xorm:"-"` | |||
| BenchmarkTypeRankLink string `xorm:"-"` | |||
| StartTime timeutil.TimeStamp | |||
| EndTime timeutil.TimeStamp | |||
| } | |||
| func (task *Cloudbrain) ComputeAndSetDuration() { | |||
| var d int64 | |||
| if task.StartTime == 0 { | |||
| d = 0 | |||
| } else if task.EndTime == 0 { | |||
| d = time.Now().Unix() - task.StartTime.AsTime().Unix() | |||
| } else { | |||
| d = task.EndTime.AsTime().Unix() - task.StartTime.AsTime().Unix() | |||
| } | |||
| if d < 0 { | |||
| d = 0 | |||
| } | |||
| task.Duration = d | |||
| task.TrainJobDuration = ConvertDurationToStr(d) | |||
| } | |||
| func ConvertDurationToStr(duration int64) string { | |||
| if duration == 0 { | |||
| return "00:00:00" | |||
| } | |||
| return util.AddZero(duration/3600) + ":" + util.AddZero(duration%3600/60) + ":" + util.AddZero(duration%60) | |||
| } | |||
| func IsTrainJobTerminal(status string) bool { | |||
| return status == string(ModelArtsTrainJobCompleted) || status == string(ModelArtsTrainJobFailed) || status == string(ModelArtsTrainJobKilled) | |||
| } | |||
| func IsModelArtsDebugJobTerminal(status string) bool { | |||
| return status == string(ModelArtsStopped) | |||
| } | |||
| func IsCloudBrainOneDebugJobTerminal(status string) bool { | |||
| return status == string(JobStopped) || status == string(JobFailed) || status == string(JobSucceeded) | |||
| } | |||
| type CloudbrainInfo struct { | |||
| @@ -1019,6 +1058,7 @@ type GetTrainJobResult struct { | |||
| NasShareAddr string `json:"nas_share_addr"` | |||
| DatasetName string | |||
| ModelMetricList string `json:"model_metric_list"` //列表里包含f1_score,recall,precision,accuracy,若有的话 | |||
| StartTime int64 `json:"start_time"` //训练作业开始时间。 | |||
| } | |||
| type GetTrainJobLogResult struct { | |||
| @@ -1327,13 +1367,13 @@ func GetCloudbrainByJobIDAndIsLatestVersion(jobID string, isLatestVersion string | |||
| func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) { | |||
| cloudBrains := make([]*Cloudbrain, 0) | |||
| err := x.Cols("job_id", "status", "type", "job_type", "version_id").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains) | |||
| err := x.Cols("job_id", "status", "type", "job_type", "version_id", "start_time").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains) | |||
| return cloudBrains, err | |||
| } | |||
| func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) { | |||
| cloudBrains := make([]*Cloudbrain, 0) | |||
| err := x.Cols("job_id", "status", "type", "job_type", "version_id").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains) | |||
| err := x.Cols("job_id", "status", "type", "job_type", "version_id", "start_time").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains) | |||
| return cloudBrains, err | |||
| } | |||
| @@ -1377,7 +1417,7 @@ func UpdateTrainJobVersion(job *Cloudbrain) error { | |||
| func updateJobTrainVersion(e Engine, job *Cloudbrain) error { | |||
| var sess *xorm.Session | |||
| sess = e.Where("job_id = ? AND version_name=?", job.JobID, job.VersionName) | |||
| _, err := sess.Cols("status", "train_job_duration").Update(job) | |||
| _, err := sess.Cols("status", "train_job_duration", "duration", "start_time", "end_time").Update(job) | |||
| return err | |||
| } | |||
| @@ -1457,7 +1497,7 @@ func UpdateInferenceJob(job *Cloudbrain) error { | |||
| func updateInferenceJob(e Engine, job *Cloudbrain) error { | |||
| var sess *xorm.Session | |||
| sess = e.Where("job_id = ?", job.JobID) | |||
| _, err := sess.Cols("status", "train_job_duration").Update(job) | |||
| _, err := sess.Cols("status", "train_job_duration", "duration", "start_time", "end_time").Update(job) | |||
| return err | |||
| } | |||
| func RestartCloudbrain(old *Cloudbrain, new *Cloudbrain) (err error) { | |||
| @@ -6,6 +6,7 @@ | |||
| package repo | |||
| import ( | |||
| "code.gitea.io/gitea/modules/timeutil" | |||
| "net/http" | |||
| "sort" | |||
| "time" | |||
| @@ -77,9 +78,17 @@ func GetCloudbrainTask(ctx *context.APIContext) { | |||
| job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP | |||
| job.ContainerID = taskRes.TaskStatuses[0].ContainerID | |||
| job.Status = taskRes.TaskStatuses[0].State | |||
| if job.StartTime == 0 && !taskRes.TaskStatuses[0].StartAt.IsZero() { | |||
| job.StartTime = timeutil.TimeStamp(taskRes.TaskStatuses[0].StartAt.Unix()) | |||
| } | |||
| } | |||
| if result.JobStatus.State != string(models.JobWaiting) { | |||
| if job.EndTime == 0 && models.IsCloudBrainOneDebugJobTerminal(job.Status) { | |||
| job.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| job.ComputeAndSetDuration() | |||
| err = models.UpdateJob(job) | |||
| if err != nil { | |||
| log.Error("UpdateJob failed:", err) | |||
| @@ -6,12 +6,11 @@ | |||
| package repo | |||
| import ( | |||
| "code.gitea.io/gitea/modules/timeutil" | |||
| "net/http" | |||
| "strconv" | |||
| "strings" | |||
| "code.gitea.io/gitea/modules/util" | |||
| "code.gitea.io/gitea/models" | |||
| "code.gitea.io/gitea/modules/context" | |||
| "code.gitea.io/gitea/modules/log" | |||
| @@ -67,8 +66,14 @@ func GetModelArtsNotebook2(ctx *context.APIContext) { | |||
| ctx.NotFound(err) | |||
| return | |||
| } | |||
| if job.StartTime == 0 && result.Lease.CreateTime > 0 { | |||
| job.StartTime = timeutil.TimeStamp(result.Lease.CreateTime / 1000) | |||
| } | |||
| job.Status = result.Status | |||
| if job.EndTime == 0 && models.IsModelArtsDebugJobTerminal(job.Status) { | |||
| job.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| job.ComputeAndSetDuration() | |||
| err = models.UpdateJob(job) | |||
| if err != nil { | |||
| log.Error("UpdateJob failed:", err) | |||
| @@ -133,16 +138,17 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) { | |||
| ctx.NotFound(err) | |||
| return | |||
| } | |||
| if job.StartTime == 0 && result.StartTime > 0 { | |||
| job.StartTime = timeutil.TimeStamp(result.StartTime / 1000) | |||
| } | |||
| job.Status = modelarts.TransTrainJobStatus(result.IntStatus) | |||
| job.Duration = result.Duration | |||
| job.Duration = result.Duration / 1000 | |||
| job.TrainJobDuration = result.TrainJobDuration | |||
| if result.Duration != 0 { | |||
| job.TrainJobDuration = util.AddZero(result.Duration/3600000) + ":" + util.AddZero(result.Duration%3600000/60000) + ":" + util.AddZero(result.Duration%60000/1000) | |||
| job.TrainJobDuration = models.ConvertDurationToStr(job.Duration) | |||
| } else { | |||
| job.TrainJobDuration = "00:00:00" | |||
| if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 { | |||
| job.EndTime = job.StartTime.Add(job.Duration) | |||
| } | |||
| err = models.UpdateTrainJobVersion(job) | |||
| @@ -366,16 +372,17 @@ func GetModelArtsInferenceJob(ctx *context.APIContext) { | |||
| ctx.NotFound(err) | |||
| return | |||
| } | |||
| if job.StartTime == 0 && result.StartTime > 0 { | |||
| job.StartTime = timeutil.TimeStamp(result.StartTime / 1000) | |||
| } | |||
| job.Status = modelarts.TransTrainJobStatus(result.IntStatus) | |||
| job.Duration = result.Duration | |||
| job.Duration = result.Duration / 1000 | |||
| job.TrainJobDuration = result.TrainJobDuration | |||
| if result.Duration != 0 { | |||
| job.TrainJobDuration = util.AddZero(result.Duration/3600000) + ":" + util.AddZero(result.Duration%3600000/60000) + ":" + util.AddZero(result.Duration%60000/1000) | |||
| job.TrainJobDuration = models.ConvertDurationToStr(result.Duration) | |||
| } else { | |||
| job.TrainJobDuration = "00:00:00" | |||
| if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 { | |||
| job.EndTime = job.StartTime.Add(job.Duration) | |||
| } | |||
| err = models.UpdateInferenceJob(job) | |||
| @@ -2,6 +2,7 @@ package repo | |||
| import ( | |||
| "bufio" | |||
| "code.gitea.io/gitea/modules/timeutil" | |||
| "encoding/json" | |||
| "errors" | |||
| "fmt" | |||
| @@ -373,6 +374,9 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName) { | |||
| task.Status = taskRes.TaskStatuses[0].State | |||
| task.ContainerID = taskRes.TaskStatuses[0].ContainerID | |||
| task.ContainerIp = taskRes.TaskStatuses[0].ContainerIP | |||
| if task.StartTime == 0 && !taskRes.TaskStatuses[0].StartAt.IsZero() { | |||
| task.StartTime = timeutil.TimeStamp(taskRes.TaskStatuses[0].StartAt.Unix()) | |||
| } | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| ctx.Data["error"] = err.Error() | |||
| @@ -398,12 +402,6 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName) { | |||
| task.User = user | |||
| } | |||
| var duration int64 | |||
| if task.Status == string(models.JobRunning) { | |||
| duration = time.Now().Unix() - int64(task.CreatedUnix) | |||
| } else { | |||
| duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix) | |||
| } | |||
| if task.BenchmarkTypeID > 0 { | |||
| for _, benchmarkType := range GetBenchmarkTypes(ctx).BenchmarkType { | |||
| if task.BenchmarkTypeID == benchmarkType.Id { | |||
| @@ -418,8 +416,16 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName) { | |||
| } | |||
| } | |||
| } | |||
| ctx.Data["duration"] = util.AddZero(duration/3600000) + ":" + util.AddZero(duration%3600000/60000) + ":" + util.AddZero(duration%60000/1000) | |||
| if task.TrainJobDuration == "" { | |||
| var duration int64 | |||
| if task.Status == string(models.JobRunning) { | |||
| duration = time.Now().Unix() - int64(task.CreatedUnix) | |||
| } else { | |||
| duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix) | |||
| } | |||
| task.TrainJobDuration = models.ConvertDurationToStr(duration) | |||
| } | |||
| ctx.Data["duration"] = task.TrainJobDuration | |||
| ctx.Data["task"] = task | |||
| ctx.Data["jobName"] = task.JobName | |||
| ctx.Data["displayJobName"] = task.DisplayJobName | |||
| @@ -482,6 +488,10 @@ func CloudBrainStop(ctx *context.Context) { | |||
| } | |||
| task.Status = string(models.JobStopped) | |||
| if task.EndTime == 0 { | |||
| task.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| task.ComputeAndSetDuration() | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) | |||
| @@ -575,6 +585,10 @@ func logErrorAndUpdateJobStatus(err error, taskInfo *models.Cloudbrain) { | |||
| log.Warn("Failed to stop cloudBrain job:"+taskInfo.JobID, err) | |||
| } else { | |||
| taskInfo.Status = string(models.JobStopped) | |||
| if taskInfo.EndTime == 0 { | |||
| taskInfo.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| taskInfo.ComputeAndSetDuration() | |||
| err = models.UpdateJob(taskInfo) | |||
| if err != nil { | |||
| log.Warn("UpdateJob failed", err) | |||
| @@ -946,6 +960,13 @@ func SyncCloudbrainStatus() { | |||
| task.Status = taskRes.TaskStatuses[0].State | |||
| if task.Status != string(models.JobWaiting) { | |||
| task.Duration = time.Now().Unix() - taskRes.TaskStatuses[0].StartAt.Unix() | |||
| if task.StartTime == 0 && !taskRes.TaskStatuses[0].StartAt.IsZero() { | |||
| task.StartTime = timeutil.TimeStamp(taskRes.TaskStatuses[0].StartAt.Unix()) | |||
| } | |||
| if task.EndTime == 0 && models.IsCloudBrainOneDebugJobTerminal(task.Status) { | |||
| task.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| task.ComputeAndSetDuration() | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err) | |||
| @@ -966,6 +987,10 @@ func SyncCloudbrainStatus() { | |||
| continue | |||
| } | |||
| task.Status = string(models.JobStopped) | |||
| if task.EndTime == 0 { | |||
| task.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| task.ComputeAndSetDuration() | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err) | |||
| @@ -984,7 +1009,13 @@ func SyncCloudbrainStatus() { | |||
| if result != nil { | |||
| task.Status = result.Status | |||
| if task.StartTime == 0 && result.Lease.CreateTime > 0 { | |||
| task.StartTime = timeutil.TimeStamp(result.Lease.CreateTime / 1000) | |||
| } | |||
| if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { | |||
| task.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| task.ComputeAndSetDuration() | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err) | |||
| @@ -1000,14 +1031,15 @@ func SyncCloudbrainStatus() { | |||
| if result != nil { | |||
| task.Status = modelarts.TransTrainJobStatus(result.IntStatus) | |||
| task.Duration = result.Duration | |||
| task.Duration = result.Duration / 1000 | |||
| task.TrainJobDuration = result.TrainJobDuration | |||
| if result.Duration != 0 { | |||
| task.TrainJobDuration = util.AddZero(result.Duration/3600000) + ":" + util.AddZero(result.Duration%3600000/60000) + ":" + util.AddZero(result.Duration%60000/1000) | |||
| } else { | |||
| task.TrainJobDuration = "00:00:00" | |||
| if task.StartTime == 0 && result.StartTime > 0 { | |||
| task.StartTime = timeutil.TimeStamp(result.StartTime / 1000) | |||
| } | |||
| task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) | |||
| if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 { | |||
| task.EndTime = task.StartTime.Add(task.Duration) | |||
| } | |||
| err = models.UpdateJob(task) | |||
| @@ -1055,13 +1087,16 @@ func CloudBrainBenchmarkIndex(ctx *context.Context) { | |||
| for i, task := range ciTasks { | |||
| ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) | |||
| ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource | |||
| var duration int64 | |||
| if task.Status == string(models.JobRunning) { | |||
| duration = time.Now().Unix() - int64(task.Cloudbrain.CreatedUnix) | |||
| } else { | |||
| duration = int64(task.Cloudbrain.UpdatedUnix) - int64(task.Cloudbrain.CreatedUnix) | |||
| if ciTasks[i].TrainJobDuration == "" { | |||
| var duration int64 | |||
| if task.Status == string(models.JobRunning) { | |||
| duration = time.Now().Unix() - int64(task.Cloudbrain.CreatedUnix) | |||
| } else { | |||
| duration = int64(task.Cloudbrain.UpdatedUnix) - int64(task.Cloudbrain.CreatedUnix) | |||
| } | |||
| ciTasks[i].TrainJobDuration = models.ConvertDurationToStr(duration) | |||
| } | |||
| ciTasks[i].TrainJobDuration = util.AddZero(duration/3600000) + ":" + util.AddZero(duration%3600000/60000) + ":" + util.AddZero(duration%60000/1000) | |||
| ciTasks[i].BenchmarkTypeName = "" | |||
| if task.BenchmarkTypeID > 0 { | |||
| for _, benchmarkType := range GetBenchmarkTypes(ctx).BenchmarkType { | |||
| @@ -2,6 +2,7 @@ package repo | |||
| import ( | |||
| "archive/zip" | |||
| "code.gitea.io/gitea/modules/timeutil" | |||
| "encoding/json" | |||
| "errors" | |||
| "io" | |||
| @@ -408,6 +409,10 @@ func NotebookManage(ctx *context.Context) { | |||
| } | |||
| task.Status = res.Status | |||
| if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { | |||
| task.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| task.ComputeAndSetDuration() | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) | |||