Browse Source

Merge branch 'V20211228' into fix-1095

tags/v1.21.12.2^2
zhoupzh 3 years ago
parent
commit
54c5f40522
11 changed files with 491 additions and 126 deletions
  1. +4
    -1
      models/cloudbrain.go
  2. +130
    -11
      modules/cloudbrain/cloudbrain.go
  3. +1
    -1
      modules/modelarts/modelarts.go
  4. +3
    -3
      modules/modelarts/resty.go
  5. +5
    -0
      modules/storage/minio.go
  6. +1
    -0
      options/locale/locale_zh-CN.ini
  7. +116
    -32
      routers/repo/cloudbrain.go
  8. +88
    -35
      routers/repo/modelarts.go
  9. +2
    -1
      routers/routes/routes.go
  10. +135
    -42
      templates/repo/debugjob/index.tmpl
  11. +6
    -0
      templates/repo/header.tmpl

+ 4
- 1
models/cloudbrain.go View File

@@ -88,6 +88,9 @@ type Cloudbrain struct {
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
Duration int64 Duration int64
TrainJobDuration string TrainJobDuration string
Image string //GPU镜像名称
GpuQueue string //GPU类型即GPU队列
ResourceSpecId int //GPU规格id
DeletedAt time.Time `xorm:"deleted"` DeletedAt time.Time `xorm:"deleted"`
CanDebug bool `xorm:"-"` CanDebug bool `xorm:"-"`
CanDel bool `xorm:"-"` CanDel bool `xorm:"-"`
@@ -102,7 +105,7 @@ type Cloudbrain struct {
IsLatestVersion string //是否是最新版本,1是,0否 IsLatestVersion string //是否是最新版本,1是,0否
CommitID string //提交的仓库代码id CommitID string //提交的仓库代码id
PreVersionName string //父版本名称 PreVersionName string //父版本名称
ComputeResource string //计算资源,例如npu
ComputeResource string `xorm:"-"` //计算资源,例如npu
EngineID int64 //引擎id EngineID int64 //引擎id


TrainUrl string //输出的obs路径 TrainUrl string //输出的obs路径


+ 130
- 11
modules/cloudbrain/cloudbrain.go View File

@@ -1,6 +1,8 @@
package cloudbrain package cloudbrain


import ( import (
"code.gitea.io/gitea/modules/storage"
"encoding/json"
"errors" "errors"
"strconv" "strconv"


@@ -107,6 +109,9 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath,
uuid uuid


var resourceSpec *models.ResourceSpec var resourceSpec *models.ResourceSpec
if ResourceSpecs == nil {
json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
}
for _, spec := range ResourceSpecs.ResourceSpec { for _, spec := range ResourceSpecs.ResourceSpec {
if resourceSpecId == spec.Id { if resourceSpecId == spec.Id {
resourceSpec = spec resourceSpec = spec
@@ -185,28 +190,142 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath,
}, },
}) })
if err != nil { if err != nil {
log.Error("CreateJob failed:", err.Error())
log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"])
return err return err
} }
if jobResult.Code != Success { if jobResult.Code != Success {
log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg)
log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
return errors.New(jobResult.Msg) return errors.New(jobResult.Msg)
} }


var jobID = jobResult.Payload["jobId"].(string) var jobID = jobResult.Payload["jobId"].(string)
err = models.CreateCloudbrain(&models.Cloudbrain{ err = models.CreateCloudbrain(&models.Cloudbrain{
Status: string(models.JobWaiting),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: jobID,
JobName: jobName,
SubTaskName: SubTaskName,
JobType: jobType,
Type: models.TypeCloudBrainOne,
Uuid: uuid,
Status: string(models.JobWaiting),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: jobID,
JobName: jobName,
SubTaskName: SubTaskName,
JobType: jobType,
Type: models.TypeCloudBrainOne,
Uuid: uuid,
Image: image,
GpuQueue: gpuQueue,
ResourceSpecId: resourceSpecId,
})

if err != nil {
return err
}

return nil
}

func RestartTask(ctx *context.Context, task *models.Cloudbrain) error {
dataActualPath := setting.Attachment.Minio.RealPath +
setting.Attachment.Minio.Bucket + "/" +
setting.Attachment.Minio.BasePath +
models.AttachmentRelativePath(task.Uuid) +
task.Uuid
jobName := task.JobName

var resourceSpec *models.ResourceSpec
if ResourceSpecs == nil {
json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
}
for _, spec := range ResourceSpecs.ResourceSpec {
if task.ResourceSpecId == spec.Id {
resourceSpec = spec
}
}

if resourceSpec == nil {
log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"])
return errors.New("no such resourceSpec")
}

jobResult, err := CreateJob(jobName, models.CreateJobParams{
JobName: jobName,
RetryCount: 1,
GpuType: task.GpuQueue,
Image: task.Image,
TaskRoles: []models.TaskRole{
{
Name: SubTaskName,
TaskNumber: 1,
MinSucceededTaskCount: 1,
MinFailedTaskCount: 1,
CPUNumber: resourceSpec.CpuNum,
GPUNumber: resourceSpec.GpuNum,
MemoryMB: resourceSpec.MemMiB,
ShmMB: resourceSpec.ShareMemMiB,
Command: Command,
NeedIBDevice: false,
IsMainRole: false,
UseNNI: false,
},
},
Volumes: []models.Volume{
{
HostPath: models.StHostPath{
Path: storage.GetMinioPath(jobName, CodeMountPath + "/"),
MountPath: CodeMountPath,
ReadOnly: false,
},
},
{
HostPath: models.StHostPath{
Path: dataActualPath,
MountPath: DataSetMountPath,
ReadOnly: true,
},
},
{
HostPath: models.StHostPath{
Path: storage.GetMinioPath(jobName, ModelMountPath + "/"),
MountPath: ModelMountPath,
ReadOnly: false,
},
},
{
HostPath: models.StHostPath{
Path: storage.GetMinioPath(jobName, BenchMarkMountPath + "/"),
MountPath: BenchMarkMountPath,
ReadOnly: true,
},
},
{
HostPath: models.StHostPath{
Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath + "/"),
MountPath: Snn4imagenetMountPath,
ReadOnly: true,
},
},
{
HostPath: models.StHostPath{
Path: storage.GetMinioPath(jobName, BrainScoreMountPath + "/"),
MountPath: BrainScoreMountPath,
ReadOnly: true,
},
},
},
}) })
if err != nil {
log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"])
return err
}
if jobResult.Code != Success {
log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
return errors.New(jobResult.Msg)
}

var jobID = jobResult.Payload["jobId"].(string)
task.JobID = jobID
task.Status = string(models.JobWaiting)
err = models.UpdateJob(task)


if err != nil { if err != nil {
log.Error("UpdateJob(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
return err return err
} }




+ 1
- 1
modules/modelarts/modelarts.go View File

@@ -277,7 +277,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
DatasetName: attach.Name, DatasetName: attach.Name,
CommitID: req.CommitID, CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion, IsLatestVersion: req.IsLatestVersion,
ComputeResource: NPUResource,
//ComputeResource: NPUResource,
EngineID: req.EngineID, EngineID: req.EngineID,
TrainUrl: req.TrainUrl, TrainUrl: req.TrainUrl,
BranchName: req.BranchName, BranchName: req.BranchName,


+ 3
- 3
modules/modelarts/resty.go View File

@@ -174,7 +174,7 @@ sendjob:
return &result, nil return &result, nil
} }


func StopJob(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) {
func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) {
checkSetting() checkSetting()
client := getRestyClient() client := getRestyClient()
var result models.NotebookActionResult var result models.NotebookActionResult
@@ -207,8 +207,8 @@ sendjob:
} }


if len(response.ErrorCode) != 0 { if len(response.ErrorCode) != 0 {
log.Error("StopJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
return &result, fmt.Errorf("StopJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
log.Error("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
return &result, fmt.Errorf("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
} }


return &result, nil return &result, nil


+ 5
- 0
modules/storage/minio.go View File

@@ -11,6 +11,7 @@ import (
"strings" "strings"
"time" "time"


"code.gitea.io/gitea/modules/setting"
"github.com/minio/minio-go" "github.com/minio/minio-go"
) )


@@ -128,3 +129,7 @@ func (m *MinioStorage) UploadObject(fileName, filePath string) error {
_, err := m.client.FPutObject(m.bucket, fileName, filePath, minio.PutObjectOptions{}) _, err := m.client.FPutObject(m.bucket, fileName, filePath, minio.PutObjectOptions{})
return err return err
} }

func GetMinioPath(jobName, suffixPath string) string {
return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + jobName + suffixPath
}

+ 1
- 0
options/locale/locale_zh-CN.ini View File

@@ -787,6 +787,7 @@ model_noright=无权限操作
model_rename=模型名称重复,请修改模型名称 model_rename=模型名称重复,请修改模型名称


debug=调试 debug=调试
debug_again=再次调试
stop=停止 stop=停止
delete=删除 delete=删除
model_download=模型下载 model_download=模型下载


+ 116
- 32
routers/repo/cloudbrain.go View File

@@ -206,7 +206,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
} }
repo := ctx.Repo.Repository repo := ctx.Repo.Repository
downloadCode(repo, codePath) downloadCode(repo, codePath)
uploadCodeToMinio(codePath+"/", jobName, "/code/")
uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")


modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/" modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
mkModelPath(modelPath) mkModelPath(modelPath)
@@ -236,9 +236,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
uploadCodeToMinio(brainScorePath+"/", jobName, cloudbrain.BrainScoreMountPath+"/") uploadCodeToMinio(brainScorePath+"/", jobName, cloudbrain.BrainScoreMountPath+"/")
} }


err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, codePath, getMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
getMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), getMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
getMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId)
err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId)
if err != nil { if err != nil {
cloudBrainNewDataPrepare(ctx) cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tplCloudBrainNew, &form) ctx.RenderWithErr(err.Error(), tplCloudBrainNew, &form)
@@ -247,6 +248,72 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob") ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob")
} }


func CloudBrainRestart(ctx *context.Context) {
var jobID = ctx.Params(":jobid")
var resultCode = "0"
var errorMsg = ""
var status = ""

for {
task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error(), ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
break
}

if task.Status != string(models.JobStopped) && task.Status != string(models.JobSucceeded) && task.Status != string(models.JobFailed) {
log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the job is not stopped"
break
}

if task.Image == "" || task.GpuQueue == "" || task.Type != models.TypeCloudBrainOne {
log.Error("the job(%s) version is too old", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the job's version is too old and can not be restarted"
break
}

count, err := models.GetCloudbrainCountByUserID(ctx.User.ID)
if err != nil {
log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
break
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the user already has running or waiting task"
break
}
}

err = cloudbrain.RestartTask(ctx, task)
if err != nil {
log.Error("RestartTask failed:%v", err.Error(), ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
break
}

status = task.Status
jobID = task.JobID

break
}

ctx.JSON(200, map[string]string{
"result_code": resultCode,
"error_msg": errorMsg,
"status": status,
"job_id": jobID,
})
}

func CloudBrainShow(ctx *context.Context) { func CloudBrainShow(ctx *context.Context) {
ctx.Data["PageIsCloudBrain"] = true ctx.Data["PageIsCloudBrain"] = true


@@ -351,32 +418,53 @@ func CloudBrainCommitImage(ctx *context.Context, form auth.CommitImageCloudBrain


func CloudBrainStop(ctx *context.Context) { func CloudBrainStop(ctx *context.Context) {
var jobID = ctx.Params(":jobid") var jobID = ctx.Params(":jobid")
task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
ctx.ServerError("GetCloudbrainByJobID failed", err)
return
}
var resultCode = "0"
var errorMsg = ""
var status = ""


if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) {
log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
ctx.ServerError("the job has been stopped", errors.New("the job has been stopped"))
return
}
for {
task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
resultCode = "-1"
errorMsg = "system error"
break
}


err = cloudbrain.StopJob(jobID)
if err != nil {
log.Error("StopJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["msgID"])
ctx.ServerError("StopJob failed", err)
return
}
if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) {
log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
resultCode = "-1"
errorMsg = "system error"
break
}


task.Status = string(models.JobStopped)
err = models.UpdateJob(task)
if err != nil {
ctx.ServerError("UpdateJob failed", err)
return
err = cloudbrain.StopJob(jobID)
if err != nil {
log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
resultCode = "-1"
errorMsg = "system error"
break
}

task.Status = string(models.JobStopped)
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
resultCode = "-1"
errorMsg = "system error"
break
}

status = task.Status
break
} }
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob")

ctx.JSON(200, map[string]string{
"result_code": resultCode,
"error_msg": errorMsg,
"status": status,
"job_id": jobID,
})
} }


func StopJobsByUserID(userID int64) { func StopJobsByUserID(userID int64) {
@@ -423,7 +511,7 @@ func StopJobs(cloudBrains []*models.Cloudbrain) {
Action: models.ActionStop, Action: models.ActionStop,
} }
err := retry(3, time.Second*30, func() error { err := retry(3, time.Second*30, func() error {
_, err := modelarts.StopJob(taskInfo.JobID, param)
_, err := modelarts.ManageNotebook(taskInfo.JobID, param)
return err return err
}) })
logErrorAndUpdateJobStatus(err, taskInfo) logErrorAndUpdateJobStatus(err, taskInfo)
@@ -560,7 +648,7 @@ func getImages(ctx *context.Context, imageType string) {


func GetModelDirs(jobName string, parentDir string) (string, error) { func GetModelDirs(jobName string, parentDir string) (string, error) {
var req string var req string
modelActualPath := getMinioPath(jobName, cloudbrain.ModelMountPath+"/")
modelActualPath := storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/")
if parentDir == "" { if parentDir == "" {
req = "baseDir=" + modelActualPath req = "baseDir=" + modelActualPath
} else { } else {
@@ -570,10 +658,6 @@ func GetModelDirs(jobName string, parentDir string) (string, error) {
return getDirs(req) return getDirs(req)
} }


func getMinioPath(jobName, suffixPath string) string {
return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + jobName + suffixPath
}

func CloudBrainDownloadModel(ctx *context.Context) { func CloudBrainDownloadModel(ctx *context.Context) {
parentDir := ctx.Query("parentDir") parentDir := ctx.Query("parentDir")
fileName := ctx.Query("fileName") fileName := ctx.Query("fileName")


+ 88
- 35
routers/repo/modelarts.go View File

@@ -42,6 +42,7 @@ const (


func DebugJobIndex(ctx *context.Context) { func DebugJobIndex(ctx *context.Context) {
debugListType := ctx.Query("debugListType") debugListType := ctx.Query("debugListType")
ctx.Data["ListType"] = debugListType
MustEnableCloudbrain(ctx) MustEnableCloudbrain(ctx)
repo := ctx.Repo.Repository repo := ctx.Repo.Repository
page := ctx.QueryInt("page") page := ctx.QueryInt("page")
@@ -73,21 +74,19 @@ func DebugJobIndex(ctx *context.Context) {
} }


for i, task := range ciTasks { for i, task := range ciTasks {
ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx)
ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)

if task.Cloudbrain.Type == models.TypeCloudBrainOne { if task.Cloudbrain.Type == models.TypeCloudBrainOne {
ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx)
ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
ciTasks[i].Cloudbrain.ComputeResource = modelarts.GPUResource ciTasks[i].Cloudbrain.ComputeResource = modelarts.GPUResource
}
if task.Cloudbrain.Type == models.TypeCloudBrainTwo {
ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx)
ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
} else if task.Cloudbrain.Type == models.TypeCloudBrainTwo {
ciTasks[i].Cloudbrain.ComputeResource = modelarts.NPUResource ciTasks[i].Cloudbrain.ComputeResource = modelarts.NPUResource
} }

} }


pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
pager.SetDefaultParams(ctx)
//pager.SetDefaultParams(ctx)
pager.AddParam(ctx, "debugListType", "ListType")
ctx.Data["Page"] = pager ctx.Data["Page"] = pager
ctx.Data["PageIsCloudBrain"] = true ctx.Data["PageIsCloudBrain"] = true
ctx.Data["Tasks"] = ciTasks ctx.Data["Tasks"] = ciTasks
@@ -232,38 +231,91 @@ func NotebookDebug(ctx *context.Context) {
ctx.Redirect(debugUrl) ctx.Redirect(debugUrl)
} }


func NotebookStop(ctx *context.Context) {
func NotebookManage(ctx *context.Context) {
var jobID = ctx.Params(":jobid") var jobID = ctx.Params(":jobid")
log.Info(jobID)
task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
ctx.ServerError("GetCloudbrainByJobID failed", err)
return
}
var action = ctx.Params(":action")
var resultCode = "0"
var errorMsg = ""
var status = ""


if task.Status != string(models.JobRunning) {
log.Error("the job(%s) is not running", task.JobName)
ctx.ServerError("the job is not running", errors.New("the job is not running"))
return
}
for {
task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
log.Error("GetCloudbrainByJobID failed:%v", err, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
break
}


param := models.NotebookAction{
Action: models.ActionStop,
}
res, err := modelarts.StopJob(jobID, param)
if err != nil {
log.Error("StopJob(%s) failed:%v", task.JobName, err.Error())
ctx.ServerError("StopJob failed", err)
return
}
if action == models.ActionStop {
if task.Status != string(models.ModelArtsRunning) {
log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the job is not running"
break
}
} else if action == models.ActionRestart {
if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the job is not stopped"
break
}


task.Status = res.CurrentStatus
err = models.UpdateJob(task)
if err != nil {
ctx.ServerError("UpdateJob failed", err)
return
count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
if err != nil {
log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
break
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "you have already a running or waiting task, can not create more"
break
}
}

action = models.ActionStart
} else {
log.Error("the action(%s) is illegal", action, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "非法操作"
break
}

param := models.NotebookAction{
Action: action,
}
res, err := modelarts.ManageNotebook(jobID, param)
if err != nil {
log.Error("ManageNotebook(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "启动失败"
break
}

task.Status = res.CurrentStatus
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
break
}

status = task.Status

break
} }
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob")

ctx.JSON(200, map[string]string{
"result_code": resultCode,
"error_msg": errorMsg,
"status": status,
"job_id": jobID,
})
} }


func NotebookDel(ctx *context.Context) { func NotebookDel(ctx *context.Context) {
@@ -323,6 +375,7 @@ func TrainJobIndex(ctx *context.Context) {
for i, task := range tasks { for i, task := range tasks {
tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain) tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
tasks[i].ComputeResource = modelarts.NPUResource
} }


pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)


+ 2
- 1
routers/routes/routes.go View File

@@ -968,6 +968,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/commit_image", cloudbrain.AdminOrOwnerOrJobCreaterRight, bindIgnErr(auth.CommitImageCloudBrainForm{}), repo.CloudBrainCommitImage) m.Post("/commit_image", cloudbrain.AdminOrOwnerOrJobCreaterRight, bindIgnErr(auth.CommitImageCloudBrainForm{}), repo.CloudBrainCommitImage)
m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop) m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop)
m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainDel) m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainDel)
m.Post("/restart", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainRestart)
m.Get("/rate", reqRepoCloudBrainReader, repo.GetRate) m.Get("/rate", reqRepoCloudBrainReader, repo.GetRate)
m.Get("/models", reqRepoCloudBrainReader, repo.CloudBrainShowModels) m.Get("/models", reqRepoCloudBrainReader, repo.CloudBrainShowModels)
m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainDownloadModel) m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainDownloadModel)
@@ -1003,7 +1004,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Group("/:jobid", func() { m.Group("/:jobid", func() {
m.Get("", reqRepoCloudBrainReader, repo.NotebookShow) m.Get("", reqRepoCloudBrainReader, repo.NotebookShow)
m.Get("/debug", reqRepoCloudBrainWriter, repo.NotebookDebug) m.Get("/debug", reqRepoCloudBrainWriter, repo.NotebookDebug)
m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookStop)
m.Post("/:action", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookManage)
m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookDel) m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookDel)
}) })
m.Get("/create", reqRepoCloudBrainWriter, repo.NotebookNew) m.Get("/create", reqRepoCloudBrainWriter, repo.NotebookNew)


+ 135
- 42
templates/repo/debugjob/index.tmpl View File

@@ -202,14 +202,20 @@
<div class="rect5"></div> <div class="rect5"></div>
</div> </div>
</div> </div>
<!-- 提示框 -->
<div class="alert"></div>



<div class="repository release dataset-list view"> <div class="repository release dataset-list view">
{{template "repo/header" .}} {{template "repo/header" .}}
<!-- {{template "base/alert" .}} -->
<!-- 提示框 -->
<!-- 列表容器 --> <!-- 列表容器 -->
<div class="ui container"> <div class="ui container">
<div class="ui two column stackable grid ">
<div class="ui negative message" style="display: none;">
<i class="close icon"></i>
<p></p>
</div>
<div class="ui two column stackable grid">
<div class="column"> <div class="column">
<div class="ui blue small menu compact selectcloudbrain"> <div class="ui blue small menu compact selectcloudbrain">
<a class="active item" href="{{.RepoLink}}/debugjob?debugListType=all">{{$.i18n.Tr "repo.modelarts.notebook"}}</a> <a class="active item" href="{{.RepoLink}}/debugjob?debugListType=all">{{$.i18n.Tr "repo.modelarts.notebook"}}</a>
@@ -282,7 +288,7 @@
<div class="row"> <div class="row">
<!-- 任务名 --> <!-- 任务名 -->
<div class="four wide column"> <div class="four wide column">
<a class="title" href='{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}' title="{{.JobName}}" style="font-size: 14px;">
<a class="title" href='{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}' title="{{.JobName}}" style="font-size: 14px;">
<span class="fitted text_over" style="width: 90%;vertical-align: middle;">{{.JobName}}</span> <span class="fitted text_over" style="width: 90%;vertical-align: middle;">{{.JobName}}</span>
</a> </a>
</div> </div>
@@ -315,34 +321,44 @@
</a> </a>
{{end}} --> {{end}} -->
<!-- 调试 --> <!-- 调试 -->
{{if .CanDebug}}
{{if eq .ComputeResource "CPU/GPU"}}
<a id="model-debug-{{.JobID}}" class='ui basic {{if ne .Status "RUNNING"}} disabled {{else}}blue {{end}}button' href="{{$.RepoLink}}/cloudbrain/{{.JobID}}/debug" target="_blank">
{{$.i18n.Tr "repo.debug"}}
</a>
<form id="debugAgainForm-{{.JobID}}">
{{$.CsrfTokenHtml}}
{{if .CanDebug}}
{{if eq .Status "RUNNING"}}
<a style="margin: 0 1rem;" id="model-debug-{{.JobID}}" class='ui basic blue button' onclick='debugAgain("{{.JobID}}","{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}/")'>
{{$.i18n.Tr "repo.debug"}}
</a>
{{else}}
<a id="model-debug-{{.JobID}}" class='ui basic {{if eq .Status "CREATING" "STOPPING" "WAITING" "STARTING"}} disabled {{else}}blue {{end}}button' onclick='debugAgain("{{.JobID}}","{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}/")'>
{{$.i18n.Tr "repo.debug_again"}}
</a>
{{end}}
{{else}} {{else}}
<a id="model-debug-{{.JobID}}" class='ui basic {{if ne .Status "RUNNING"}} disabled {{else}}blue {{end}}button' href="{{$.RepoLink}}/modelarts/notebook/{{.JobID}}/debug" target="_blank">
{{$.i18n.Tr "repo.debug"}}
<a class="ui basic disabled button">
{{$.i18n.Tr "repo.debug_again"}}
</a> </a>
{{end}} {{end}}
{{else}}
<a class="ui basic disabled button">
{{$.i18n.Tr "repo.debug"}}
</a>
{{end}}
</form>
<!-- 停止 --> <!-- 停止 -->
<form id="stopForm-{{.JobID}}" action="{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}/stop" method="post" style="margin-left:-1px;">
<form id="stopForm-{{.JobID}}" style="margin-left:-1px;">
{{$.CsrfTokenHtml}} {{$.CsrfTokenHtml}}
{{if .CanDel}} {{if .CanDel}}
<a id="stop-model-debug-{{.JobID}}" class='ui basic {{if eq .Status "STOPPED" "FAILED" "START_FAILED" "STOPPING" "CREATING" "STARTING"}}disabled {{else}}blue {{end}}button' onclick="document.getElementById('stopForm-{{.JobID}}').submit();">
{{$.i18n.Tr "repo.stop"}}
</a>
{{if eq .ComputeResource "CPU/GPU" }}
<a id="stop-model-debug-{{.JobID}}" class='ui basic {{if eq .Status "STOPPED" "FAILED" "START_FAILED" "STOPPING" "CREATING" "STARTING"}}disabled {{else}}blue {{end}}button' onclick='stopDebug("{{.JobID}}","{{$.RepoLink}}/cloudbrain/{{.JobID}}/stop")'>
{{$.i18n.Tr "repo.stop"}}
</a>
{{else}}
<a id="stop-model-debug-{{.JobID}}" class='ui basic {{if eq .Status "STOPPED" "FAILED" "START_FAILED" "STOPPING" "CREATING" "STARTING"}}disabled {{else}}blue {{end}}button' onclick='stopDebug("{{.JobID}}","{{$.RepoLink}}/modelarts/notebook/{{.JobID}}/stop")'>
{{$.i18n.Tr "repo.stop"}}
</a>
{{end}}
{{else}} {{else}}
<a class="ui basic disabled button" onclick="document.getElementById('stopForm-{{.JobID}}').submit();">
<a class="ui basic disabled button">
{{$.i18n.Tr "repo.stop"}} {{$.i18n.Tr "repo.stop"}}
</a> </a>
{{end}} {{end}}
<input type="hidden" name="debugListType" value="all">
</form> </form>
<!-- 删除 --> <!-- 删除 -->
<form id="delForm-{{.JobID}}" action="{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}/del" method="post"> <form id="delForm-{{.JobID}}" action="{{if eq .ComputeResource "CPU/GPU"}}{{$.RepoLink}}/cloudbrain{{else}}{{$.RepoLink}}/modelarts/notebook{{end}}/{{.JobID}}/del" method="post">
@@ -463,10 +479,21 @@


<script> <script>
// 调试和评分新开窗口 // 调试和评分新开窗口
const {AppSubUrl, StaticUrlPrefix, csrf} = window.config;
let url={{.RepoLink}} let url={{.RepoLink}}
let getParam=location.search.split('?debugListType=').pop()
let getParam=getQueryVariable('debugListType')
let dropdownValue = getParam==='all'||getParam==='' ? '全部' : getParam let dropdownValue = getParam==='all'||getParam==='' ? '全部' : getParam
localStorage.setItem('all',location.href) localStorage.setItem('all',location.href)
function getQueryVariable(variable)
{
let query = window.location.search.substring(1);
let vars = query.split("&");
for (let i=0;i<vars.length;i++) {
let pair = vars[i].split("=");
if(pair[0] == variable){return pair[1];}
}
return(false);
}
function stop(obj) { function stop(obj) {
if (obj.style.color != "rgb(204, 204, 204)") { if (obj.style.color != "rgb(204, 204, 204)") {
obj.target = '_blank' obj.target = '_blank'
@@ -499,7 +526,68 @@
.modal('show') .modal('show')
} }
} }
function debugAgain(JobID,debugUrl){
if($('#' + JobID+ '-text').text()==="RUNNING"){
window.open(debugUrl+'debug')
}else{
$.ajax({
type:"POST",
url:debugUrl+'restart',
data:$('#debugAgainForm-'+JobID).serialize(),
success:function(res){
if(res.result_code==="0"){
if(res.job_id!==JobID){
location.reload()
}else{
$('#' + JobID+'-icon').removeClass().addClass(res.status)
$('#' + JobID+ '-text').text(res.status)
$('#model-debug-'+JobID).removeClass('blue').addClass('disabled')
$('#model-delete-'+JobID).removeClass('blue').addClass('disabled')
}
}else{
$(".ui.negative.message").css("display","block")
$(".ui.negative.message p").text(res.error_msg)
setTimeout("$('.message .close').click()",3000)
}
},
error :function(res){
console.log(res)


}
})
}
}
function stopDebug(JobID,stopUrl){
$.ajax({
type:"POST",
url:stopUrl,
data:$('#stopForm-'+JobID).serialize(),
success:function(res){
if(res.result_code==="0"){
$('#' + JobID+'-icon').removeClass().addClass(res.status)
$('#' + JobID+ '-text').text(res.status)
if(res.status==="STOPPED"){
$('#model-debug-'+JobID).removeClass('blue').addClass('disabled').text("再次调试").css("margin","0")
$('#model-image-'+JobID).removeClass('blue').addClass('disabled')
$('#stop-model-debug-'+JobID).removeClass('blue').addClass('disabled')
}
else{
$('#model-debug-'+JobID).removeClass('blue').addClass('disabled')
$('#stop-model-debug-'+JobID).removeClass('blue').addClass('disabled')
}
}else{
$("ui.negative.message").text(res.error_msg)
}
},
error :function(res){
console.log(res)

}
})

}
// 加载任务状态 // 加载任务状态
var timeid = window.setInterval(loadJobStatus, 15000); var timeid = window.setInterval(loadJobStatus, 15000);
$(document).ready(loadJobStatus); $(document).ready(loadJobStatus);
@@ -508,8 +596,9 @@
const jobID = job.dataset.jobid; const jobID = job.dataset.jobid;
const repoPath = job.dataset.repopath; const repoPath = job.dataset.repopath;
const computeResource = job.dataset.resource const computeResource = job.dataset.resource
const initArray = ['STOPPED','FAILED','START_FAILED','CREATE_FAILED']
const initArray = ['STOPPED','FAILED','START_FAILED','CREATE_FAILED','SUCCEEDED']
if (initArray.includes(job.textContent.trim())) { if (initArray.includes(job.textContent.trim())) {
return return
} }
const diffResource = computeResource == "NPU" ? 'modelarts/notebook' : 'cloudbrain' const diffResource = computeResource == "NPU" ? 'modelarts/notebook' : 'cloudbrain'
@@ -521,32 +610,30 @@
$('#' + jobID+ '-text').text(status) $('#' + jobID+ '-text').text(status)
} }
if(status==="RUNNING"){ if(status==="RUNNING"){
$('#model-debug-'+jobID).removeClass('disabled')
$('#model-debug-'+jobID).addClass('blue')
$('#model-image-'+jobID).removeClass('disabled')
$('#model-image-'+jobID).addClass('blue')
$('#model-debug-'+jobID).removeClass('disabled').addClass('blue').text('调试').css("margin","0 1rem")
$('#model-image-'+jobID).removeClass('disabled').addClass('blue')
} }
if(status!=="RUNNING"){ if(status!=="RUNNING"){
$('#model-debug-'+jobID).removeClass('blue')
$('#model-debug-'+jobID).addClass('disabled')
$('#model-image-'+jobID).removeClass('blue')
$('#model-image-'+jobID).addClass('disabled')

// $('#model-debug-'+jobID).removeClass('blue')
// $('#model-debug-'+jobID).addClass('disabled')
$('#model-image-'+jobID).removeClass('blue').addClass('disabled')
}
if(["CREATING","STOPPING","WAITING","STARTING"].includes(status)){
$('#model-debug-'+jobID).removeClass('blue').addClass('disabled')
}
if(['STOPPED','FAILED','START_FAILED','CREATE_FAILED','SUCCEEDED'].includes(status)){
$('#model-debug-'+jobID).removeClass('disabled').addClass('blue').text('再次调试').css("margin","0")
} }
if(["RUNNING","WAITING"].includes(status)){ if(["RUNNING","WAITING"].includes(status)){
$('#stop-model-debug-'+jobID).removeClass('disabled')
$('#stop-model-debug-'+jobID).addClass('blue')
$('#stop-model-debug-'+jobID).removeClass('disabled').addClass('blue')
} }
if(["CREATING","STOPPING","STARTING","STOPPED","FAILED","START_FAILED"].includes(status)){ if(["CREATING","STOPPING","STARTING","STOPPED","FAILED","START_FAILED"].includes(status)){
$('#stop-model-debug-'+jobID).removeClass('blue')
$('#stop-model-debug-'+jobID).addClass('disabled')
$('#stop-model-debug-'+jobID).removeClass('blue').addClass('disabled')
} }
if(status==="STOPPED" || status==="FAILED"|| status==="START_FAILED"){ if(status==="STOPPED" || status==="FAILED"|| status==="START_FAILED"){
$('#model-delete-'+jobID).removeClass('disabled')
$('#model-delete-'+jobID).addClass('blue')
$('#model-delete-'+jobID).removeClass('disabled').addClass('blue')
}else{ }else{
$('#model-delete-'+jobID).removeClass('blue')
$('#model-delete-'+jobID).addClass('disabled')
$('#model-delete-'+jobID).removeClass('blue').addClass('disabled')
} }
}).fail(function(err) { }).fail(function(err) {
console.log(err); console.log(err);
@@ -554,6 +641,7 @@
}); });
}; };
$(document).ready(function(){ $(document).ready(function(){
dropdownValue = dropdownValue==="CPU%2FGPU"? 'CPU/GPU' : dropdownValue
$('.default.text').text(dropdownValue) $('.default.text').text(dropdownValue)
$('.ui.dropdown') $('.ui.dropdown')
.dropdown({ .dropdown({
@@ -564,6 +652,12 @@
location.href = `${url}/debugjob?debugListType=${value}` location.href = `${url}/debugjob?debugListType=${value}`
} }
}) })
$('.message .close')
.on('click', function() {
$(this)
.closest('.message')
.transition('fade')
})
}) })
@@ -601,7 +695,6 @@
// 显示弹窗,弹出相应的信息 // 显示弹窗,弹出相应的信息
function showmask() { function showmask() {
var image_tag = !$('#image_tag').val() var image_tag = !$('#image_tag').val()
console.log("image_tag",image_tag)
if(image_tag){ if(image_tag){
return return
} }


+ 6
- 0
templates/repo/header.tmpl View File

@@ -149,6 +149,12 @@
{{.i18n.Tr "repo.model_manager"}} {{.i18n.Tr "repo.model_manager"}}
</a> </a>
{{end}} {{end}}
{{if .Permission.CanRead $.UnitTypeModelManage}}
<a class="{{if .isModelManage}}active{{end}} item" href="{{.RepoLink}}/modelmanage/show_model">
<svg class="svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="16" height="16"><path fill="none" d="M0 0h24v24H0z"/><path d="M3.741 1.408l18.462 10.154a.5.5 0 0 1 0 .876L3.741 22.592A.5.5 0 0 1 3 22.154V1.846a.5.5 0 0 1 .741-.438zM5 13v6.617L18.85 12 5 4.383V11h5v2H5z"/></svg>
{{.i18n.Tr "repo.model_manager"}}
</a>
{{end}}
{{if .Permission.CanRead $.UnitTypeCloudBrain}} {{if .Permission.CanRead $.UnitTypeCloudBrain}}
<a class="{{if .PageIsCloudBrain}}active{{end}} item" href="{{.RepoLink}}/debugjob?debugListType=all"> <a class="{{if .PageIsCloudBrain}}active{{end}} item" href="{{.RepoLink}}/debugjob?debugListType=all">
<span> <span>


Loading…
Cancel
Save