Browse Source

merge

tags/v1.22.3.2^2
lewis 3 years ago
parent
commit
ca0dbeb943
5 changed files with 145 additions and 47 deletions
  1. +53
    -13
      models/cloudbrain.go
  2. +9
    -0
      routers/api/v1/repo/cloudbrain.go
  3. +22
    -13
      routers/api/v1/repo/modelarts.go
  4. +56
    -21
      routers/repo/cloudbrain.go
  5. +5
    -0
      routers/repo/modelarts.go

+ 53
- 13
models/cloudbrain.go View File

@@ -1,6 +1,7 @@
package models package models


import ( import (
"code.gitea.io/gitea/modules/util"
"encoding/json" "encoding/json"
"fmt" "fmt"
"strconv" "strconv"
@@ -110,15 +111,15 @@ type Cloudbrain struct {
ContainerIp string ContainerIp string
CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"` CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
Duration int64
TrainJobDuration string
Image string //镜像名称
GpuQueue string //GPU类型即GPU队列
ResourceSpecId int //GPU规格id
DeletedAt time.Time `xorm:"deleted"`
CanDebug bool `xorm:"-"`
CanDel bool `xorm:"-"`
CanModify bool `xorm:"-"`
Duration int64 `xorm:"DEFAULT 0"` //运行时长 单位秒
TrainJobDuration string `xorm:"DEFAULT '00:00:00'"`
Image string //镜像名称
GpuQueue string //GPU类型即GPU队列
ResourceSpecId int //GPU规格id
DeletedAt time.Time `xorm:"deleted"`
CanDebug bool `xorm:"-"`
CanDel bool `xorm:"-"`
CanModify bool `xorm:"-"`
Type int Type int
BenchmarkTypeID int BenchmarkTypeID int
BenchmarkChildTypeID int BenchmarkChildTypeID int
@@ -158,6 +159,44 @@ type Cloudbrain struct {
Repo *Repository `xorm:"-"` Repo *Repository `xorm:"-"`
BenchmarkTypeName string `xorm:"-"` BenchmarkTypeName string `xorm:"-"`
BenchmarkTypeRankLink string `xorm:"-"` BenchmarkTypeRankLink string `xorm:"-"`
StartTime timeutil.TimeStamp
EndTime timeutil.TimeStamp
}

// ComputeAndSetDuration derives how long the task has run from StartTime and
// EndTime, then stores the result in Duration (seconds) and in
// TrainJobDuration (the string produced by ConvertDurationToStr).
//
//   - StartTime unset: the duration is 0.
//   - StartTime set, EndTime unset: measured from StartTime up to now.
//   - both set: EndTime minus StartTime.
//
// A negative result is clamped to 0.
func (task *Cloudbrain) ComputeAndSetDuration() {
	var elapsed int64
	switch {
	case task.StartTime == 0:
		// Not started yet: keep the zero value.
	case task.EndTime == 0:
		elapsed = time.Now().Unix() - task.StartTime.AsTime().Unix()
	default:
		elapsed = task.EndTime.AsTime().Unix() - task.StartTime.AsTime().Unix()
	}

	if elapsed < 0 {
		elapsed = 0
	}

	task.Duration = elapsed
	task.TrainJobDuration = ConvertDurationToStr(elapsed)
}

// ConvertDurationToStr renders a duration given in seconds as
// hours:minutes:seconds, passing each component through util.AddZero
// (presumably zero-padding to two digits — confirm against modules/util).
// The hours component is not wrapped at 24.
//
// Zero and negative durations are rendered as "00:00:00". Negative values are
// clamped because Go's integer / and % truncate toward zero, which would
// otherwise produce negative components in the output.
func ConvertDurationToStr(duration int64) string {
	if duration <= 0 {
		return "00:00:00"
	}
	hours := duration / 3600
	minutes := duration % 3600 / 60
	seconds := duration % 60
	return util.AddZero(hours) + ":" + util.AddZero(minutes) + ":" + util.AddZero(seconds)
}

// IsTrainJobTerminal reports whether status equals one of the train-job end
// states: ModelArtsTrainJobCompleted, ModelArtsTrainJobFailed or
// ModelArtsTrainJobKilled.
func IsTrainJobTerminal(status string) bool {
	if status == string(ModelArtsTrainJobCompleted) {
		return true
	}
	if status == string(ModelArtsTrainJobFailed) {
		return true
	}
	return status == string(ModelArtsTrainJobKilled)
}

// IsModelArtsDebugJobTerminal reports whether status equals ModelArtsStopped,
// the only state this helper treats as terminal.
func IsModelArtsDebugJobTerminal(status string) bool {
	stopped := string(ModelArtsStopped)
	return status == stopped
}

func IsCloudBrainOneDebugJobTerminal(status string) bool {
return status == string(JobStopped) || status == string(JobFailed) || status == string(JobSucceeded)
} }


type CloudbrainInfo struct { type CloudbrainInfo struct {
@@ -1027,6 +1066,7 @@ type GetTrainJobResult struct {
NasShareAddr string `json:"nas_share_addr"` NasShareAddr string `json:"nas_share_addr"`
DatasetName string DatasetName string
ModelMetricList string `json:"model_metric_list"` //列表里包含f1_score,recall,precision,accuracy,若有的话 ModelMetricList string `json:"model_metric_list"` //列表里包含f1_score,recall,precision,accuracy,若有的话
StartTime int64 `json:"start_time"` //训练作业开始时间。
} }


type GetTrainJobLogResult struct { type GetTrainJobLogResult struct {
@@ -1335,13 +1375,13 @@ func GetCloudbrainByJobIDAndIsLatestVersion(jobID string, isLatestVersion string


// GetCloudbrainsNeededStopByUserID returns the Cloudbrain records for userID
// whose status is not JobStopped. The query is restricted through Cols to
// job_id, status, type, job_type and version_id; the newer form of the query
// additionally selects start_time. The slice and the error from Find are
// returned together.
func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) { func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
cloudBrains := make([]*Cloudbrain, 0) cloudBrains := make([]*Cloudbrain, 0)
err := x.Cols("job_id", "status", "type", "job_type", "version_id").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
err := x.Cols("job_id", "status", "type", "job_type", "version_id", "start_time").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
return cloudBrains, err return cloudBrains, err
} }


// GetCloudbrainsNeededStopByRepoID returns the Cloudbrain records for repoID
// whose status is not JobStopped. The query is restricted through Cols to
// job_id, status, type, job_type and version_id; the newer form of the query
// additionally selects start_time. The slice and the error from Find are
// returned together.
func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) { func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) {
cloudBrains := make([]*Cloudbrain, 0) cloudBrains := make([]*Cloudbrain, 0)
err := x.Cols("job_id", "status", "type", "job_type", "version_id").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
err := x.Cols("job_id", "status", "type", "job_type", "version_id", "start_time").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
return cloudBrains, err return cloudBrains, err
} }


@@ -1385,7 +1425,7 @@ func UpdateTrainJobVersion(job *Cloudbrain) error {
// updateJobTrainVersion updates the record matched by job.JobID and
// job.VersionName using engine e. Cols limits the update to status and
// train_job_duration; the newer form of the statement also writes duration,
// start_time and end_time. The affected-row count from Update is discarded
// and only the error is returned.
func updateJobTrainVersion(e Engine, job *Cloudbrain) error { func updateJobTrainVersion(e Engine, job *Cloudbrain) error {
var sess *xorm.Session var sess *xorm.Session
sess = e.Where("job_id = ? AND version_name=?", job.JobID, job.VersionName) sess = e.Where("job_id = ? AND version_name=?", job.JobID, job.VersionName)
_, err := sess.Cols("status", "train_job_duration").Update(job)
_, err := sess.Cols("status", "train_job_duration", "duration", "start_time", "end_time").Update(job)
return err return err
} }


@@ -1465,7 +1505,7 @@ func UpdateInferenceJob(job *Cloudbrain) error {
// updateInferenceJob updates the record matched by job.JobID using engine e.
// Cols limits the update to status and train_job_duration; the newer form of
// the statement also writes duration, start_time and end_time. The
// affected-row count from Update is discarded and only the error is returned.
func updateInferenceJob(e Engine, job *Cloudbrain) error { func updateInferenceJob(e Engine, job *Cloudbrain) error {
var sess *xorm.Session var sess *xorm.Session
sess = e.Where("job_id = ?", job.JobID) sess = e.Where("job_id = ?", job.JobID)
_, err := sess.Cols("status", "train_job_duration").Update(job)
_, err := sess.Cols("status", "train_job_duration", "duration", "start_time", "end_time").Update(job)
return err return err
} }
func RestartCloudbrain(old *Cloudbrain, new *Cloudbrain) (err error) { func RestartCloudbrain(old *Cloudbrain, new *Cloudbrain) (err error) {


+ 9
- 0
routers/api/v1/repo/cloudbrain.go View File

@@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/context" "code.gitea.io/gitea/modules/context"
"code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/storage" "code.gitea.io/gitea/modules/storage"
"code.gitea.io/gitea/modules/timeutil"
routerRepo "code.gitea.io/gitea/routers/repo" routerRepo "code.gitea.io/gitea/routers/repo"
) )


@@ -80,9 +81,17 @@ func GetCloudbrainTask(ctx *context.APIContext) {
job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP
job.ContainerID = taskRes.TaskStatuses[0].ContainerID job.ContainerID = taskRes.TaskStatuses[0].ContainerID
job.Status = taskRes.TaskStatuses[0].State job.Status = taskRes.TaskStatuses[0].State

if job.StartTime == 0 && !taskRes.TaskStatuses[0].StartAt.IsZero() {
job.StartTime = timeutil.TimeStamp(taskRes.TaskStatuses[0].StartAt.Unix())
}
} }


if result.JobStatus.State != string(models.JobWaiting) { if result.JobStatus.State != string(models.JobWaiting) {
if job.EndTime == 0 && models.IsCloudBrainOneDebugJobTerminal(job.Status) {
job.EndTime = timeutil.TimeStampNow()
}
job.ComputeAndSetDuration()
err = models.UpdateJob(job) err = models.UpdateJob(job)
if err != nil { if err != nil {
log.Error("UpdateJob failed:", err) log.Error("UpdateJob failed:", err)


+ 22
- 13
routers/api/v1/repo/modelarts.go View File

@@ -16,7 +16,7 @@ import (
"code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/modelarts" "code.gitea.io/gitea/modules/modelarts"
"code.gitea.io/gitea/modules/storage" "code.gitea.io/gitea/modules/storage"
"code.gitea.io/gitea/modules/util"
"code.gitea.io/gitea/modules/timeutil"
routerRepo "code.gitea.io/gitea/routers/repo" routerRepo "code.gitea.io/gitea/routers/repo"
) )


@@ -67,8 +67,14 @@ func GetModelArtsNotebook2(ctx *context.APIContext) {
ctx.NotFound(err) ctx.NotFound(err)
return return
} }

if job.StartTime == 0 && result.Lease.CreateTime > 0 {
job.StartTime = timeutil.TimeStamp(result.Lease.CreateTime / 1000)
}
job.Status = result.Status job.Status = result.Status
if job.EndTime == 0 && models.IsModelArtsDebugJobTerminal(job.Status) {
job.EndTime = timeutil.TimeStampNow()
}
job.ComputeAndSetDuration()
err = models.UpdateJob(job) err = models.UpdateJob(job)
if err != nil { if err != nil {
log.Error("UpdateJob failed:", err) log.Error("UpdateJob failed:", err)
@@ -166,15 +172,17 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) {
return return
} }


if job.StartTime == 0 && result.StartTime > 0 {
job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
}
job.Status = modelarts.TransTrainJobStatus(result.IntStatus) job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
job.Duration = result.Duration
job.Duration = result.Duration / 1000
job.TrainJobDuration = result.TrainJobDuration job.TrainJobDuration = result.TrainJobDuration


if result.Duration != 0 {
job.TrainJobDuration = util.AddZero(result.Duration/3600000) + ":" + util.AddZero(result.Duration%3600000/60000) + ":" + util.AddZero(result.Duration%60000/1000)
job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)


} else {
job.TrainJobDuration = "00:00:00"
if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
job.EndTime = job.StartTime.Add(job.Duration)
} }


err = models.UpdateTrainJobVersion(job) err = models.UpdateTrainJobVersion(job)
@@ -399,16 +407,17 @@ func GetModelArtsInferenceJob(ctx *context.APIContext) {
ctx.NotFound(err) ctx.NotFound(err)
return return
} }

if job.StartTime == 0 && result.StartTime > 0 {
job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
}
job.Status = modelarts.TransTrainJobStatus(result.IntStatus) job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
job.Duration = result.Duration
job.Duration = result.Duration / 1000
job.TrainJobDuration = result.TrainJobDuration job.TrainJobDuration = result.TrainJobDuration


if result.Duration != 0 {
job.TrainJobDuration = util.AddZero(result.Duration/3600000) + ":" + util.AddZero(result.Duration%3600000/60000) + ":" + util.AddZero(result.Duration%60000/1000)
job.TrainJobDuration = models.ConvertDurationToStr(result.Duration)


} else {
job.TrainJobDuration = "00:00:00"
if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
job.EndTime = job.StartTime.Add(job.Duration)
} }


err = models.UpdateInferenceJob(job) err = models.UpdateInferenceJob(job)


+ 56
- 21
routers/repo/cloudbrain.go View File

@@ -2,6 +2,7 @@ package repo


import ( import (
"bufio" "bufio"
"code.gitea.io/gitea/modules/timeutil"
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
@@ -422,6 +423,9 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
task.Status = taskRes.TaskStatuses[0].State task.Status = taskRes.TaskStatuses[0].State
task.ContainerID = taskRes.TaskStatuses[0].ContainerID task.ContainerID = taskRes.TaskStatuses[0].ContainerID
task.ContainerIp = taskRes.TaskStatuses[0].ContainerIP task.ContainerIp = taskRes.TaskStatuses[0].ContainerIP
if task.StartTime == 0 && !taskRes.TaskStatuses[0].StartAt.IsZero() {
task.StartTime = timeutil.TimeStamp(taskRes.TaskStatuses[0].StartAt.Unix())
}
err = models.UpdateJob(task) err = models.UpdateJob(task)
if err != nil { if err != nil {
ctx.Data["error"] = err.Error() ctx.Data["error"] = err.Error()
@@ -447,12 +451,6 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
task.User = user task.User = user
} }


var duration int64
if task.Status == string(models.JobRunning) {
duration = time.Now().Unix() - int64(task.CreatedUnix)
} else {
duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix)
}
if task.BenchmarkTypeID > 0 { if task.BenchmarkTypeID > 0 {
for _, benchmarkType := range GetBenchmarkTypes(ctx).BenchmarkType { for _, benchmarkType := range GetBenchmarkTypes(ctx).BenchmarkType {
if task.BenchmarkTypeID == benchmarkType.Id { if task.BenchmarkTypeID == benchmarkType.Id {
@@ -467,8 +465,16 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
} }
} }
} }

ctx.Data["duration"] = util.AddZero(duration/3600000) + ":" + util.AddZero(duration%3600000/60000) + ":" + util.AddZero(duration%60000/1000)
if task.TrainJobDuration == "" {
var duration int64
if task.Status == string(models.JobRunning) {
duration = time.Now().Unix() - int64(task.CreatedUnix)
} else {
duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix)
}
task.TrainJobDuration = models.ConvertDurationToStr(duration)
}
ctx.Data["duration"] = task.TrainJobDuration
ctx.Data["task"] = task ctx.Data["task"] = task
ctx.Data["jobName"] = task.JobName ctx.Data["jobName"] = task.JobName
ctx.Data["displayJobName"] = task.DisplayJobName ctx.Data["displayJobName"] = task.DisplayJobName
@@ -532,6 +538,10 @@ func CloudBrainStop(ctx *context.Context) {
} }


task.Status = string(models.JobStopped) task.Status = string(models.JobStopped)
if task.EndTime == 0 {
task.EndTime = timeutil.TimeStampNow()
}
task.ComputeAndSetDuration()
err = models.UpdateJob(task) err = models.UpdateJob(task)
if err != nil { if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
@@ -626,6 +636,10 @@ func logErrorAndUpdateJobStatus(err error, taskInfo *models.Cloudbrain) {
log.Warn("Failed to stop cloudBrain job:"+taskInfo.JobID, err) log.Warn("Failed to stop cloudBrain job:"+taskInfo.JobID, err)
} else { } else {
taskInfo.Status = string(models.JobStopped) taskInfo.Status = string(models.JobStopped)
if taskInfo.EndTime == 0 {
taskInfo.EndTime = timeutil.TimeStampNow()
}
taskInfo.ComputeAndSetDuration()
err = models.UpdateJob(taskInfo) err = models.UpdateJob(taskInfo)
if err != nil { if err != nil {
log.Warn("UpdateJob failed", err) log.Warn("UpdateJob failed", err)
@@ -997,6 +1011,13 @@ func SyncCloudbrainStatus() {
task.Status = taskRes.TaskStatuses[0].State task.Status = taskRes.TaskStatuses[0].State
if task.Status != string(models.JobWaiting) { if task.Status != string(models.JobWaiting) {
task.Duration = time.Now().Unix() - taskRes.TaskStatuses[0].StartAt.Unix() task.Duration = time.Now().Unix() - taskRes.TaskStatuses[0].StartAt.Unix()
if task.StartTime == 0 && !taskRes.TaskStatuses[0].StartAt.IsZero() {
task.StartTime = timeutil.TimeStamp(taskRes.TaskStatuses[0].StartAt.Unix())
}
if task.EndTime == 0 && models.IsCloudBrainOneDebugJobTerminal(task.Status) {
task.EndTime = timeutil.TimeStampNow()
}
task.ComputeAndSetDuration()
err = models.UpdateJob(task) err = models.UpdateJob(task)
if err != nil { if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err) log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
@@ -1017,6 +1038,10 @@ func SyncCloudbrainStatus() {
continue continue
} }
task.Status = string(models.JobStopped) task.Status = string(models.JobStopped)
if task.EndTime == 0 {
task.EndTime = timeutil.TimeStampNow()
}
task.ComputeAndSetDuration()
err = models.UpdateJob(task) err = models.UpdateJob(task)
if err != nil { if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err) log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
@@ -1035,7 +1060,13 @@ func SyncCloudbrainStatus() {


if result != nil { if result != nil {
task.Status = result.Status task.Status = result.Status

if task.StartTime == 0 && result.Lease.CreateTime > 0 {
task.StartTime = timeutil.TimeStamp(result.Lease.CreateTime / 1000)
}
if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
task.EndTime = timeutil.TimeStampNow()
}
task.ComputeAndSetDuration()
err = models.UpdateJob(task) err = models.UpdateJob(task)
if err != nil { if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err) log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
@@ -1051,14 +1082,15 @@ func SyncCloudbrainStatus() {


if result != nil { if result != nil {
task.Status = modelarts.TransTrainJobStatus(result.IntStatus) task.Status = modelarts.TransTrainJobStatus(result.IntStatus)
task.Duration = result.Duration
task.Duration = result.Duration / 1000
task.TrainJobDuration = result.TrainJobDuration task.TrainJobDuration = result.TrainJobDuration


if result.Duration != 0 {
task.TrainJobDuration = util.AddZero(result.Duration/3600000) + ":" + util.AddZero(result.Duration%3600000/60000) + ":" + util.AddZero(result.Duration%60000/1000)

} else {
task.TrainJobDuration = "00:00:00"
if task.StartTime == 0 && result.StartTime > 0 {
task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
}
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
task.EndTime = task.StartTime.Add(task.Duration)
} }


err = models.UpdateJob(task) err = models.UpdateJob(task)
@@ -1106,13 +1138,16 @@ func CloudBrainBenchmarkIndex(ctx *context.Context) {
for i, task := range ciTasks { for i, task := range ciTasks {
ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource
var duration int64
if task.Status == string(models.JobRunning) {
duration = time.Now().Unix() - int64(task.Cloudbrain.CreatedUnix)
} else {
duration = int64(task.Cloudbrain.UpdatedUnix) - int64(task.Cloudbrain.CreatedUnix)
if ciTasks[i].TrainJobDuration == "" {
var duration int64
if task.Status == string(models.JobRunning) {
duration = time.Now().Unix() - int64(task.Cloudbrain.CreatedUnix)
} else {
duration = int64(task.Cloudbrain.UpdatedUnix) - int64(task.Cloudbrain.CreatedUnix)
}
ciTasks[i].TrainJobDuration = models.ConvertDurationToStr(duration)
} }
ciTasks[i].TrainJobDuration = util.AddZero(duration/3600000) + ":" + util.AddZero(duration%3600000/60000) + ":" + util.AddZero(duration%60000/1000)
ciTasks[i].BenchmarkTypeName = "" ciTasks[i].BenchmarkTypeName = ""
if task.BenchmarkTypeID > 0 { if task.BenchmarkTypeID > 0 {
for _, benchmarkType := range GetBenchmarkTypes(ctx).BenchmarkType { for _, benchmarkType := range GetBenchmarkTypes(ctx).BenchmarkType {


+ 5
- 0
routers/repo/modelarts.go View File

@@ -2,6 +2,7 @@ package repo


import ( import (
"archive/zip" "archive/zip"
"code.gitea.io/gitea/modules/timeutil"
"encoding/json" "encoding/json"
"errors" "errors"
"io" "io"
@@ -414,6 +415,10 @@ func NotebookManage(ctx *context.Context) {
} }


task.Status = res.Status task.Status = res.Status
if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
task.EndTime = timeutil.TimeStampNow()
}
task.ComputeAndSetDuration()
err = models.UpdateJob(task) err = models.UpdateJob(task)
if err != nil { if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])


Loading…
Cancel
Save