Browse Source

debug

tags/v1.21.12.2^2
lewis 4 years ago
parent
commit
e2f8b5507a
6 changed files with 145 additions and 30 deletions
  1. +3
    -0
      models/cloudbrain.go
  2. +123
    -11
      modules/cloudbrain/cloudbrain.go
  3. +5
    -0
      modules/storage/minio.go
  4. +8
    -14
      routers/repo/cloudbrain.go
  5. +6
    -5
      routers/repo/modelarts.go
  6. +0
    -0
      templates/repo/debugjob/index.tmpl

+ 3
- 0
models/cloudbrain.go View File

@@ -88,6 +88,9 @@ type Cloudbrain struct {
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
Duration int64
TrainJobDuration string
Image string //GPU镜像名称
GpuQueue string //GPU类型即GPU队列
ResourceSpecId int //GPU规格id
DeletedAt time.Time `xorm:"deleted"`
CanDebug bool `xorm:"-"`
CanDel bool `xorm:"-"`


+ 123
- 11
modules/cloudbrain/cloudbrain.go View File

@@ -1,6 +1,7 @@
package cloudbrain

import (
"code.gitea.io/gitea/modules/storage"
"errors"
"strconv"

@@ -185,25 +186,28 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath,
},
})
if err != nil {
log.Error("CreateJob failed:", err.Error())
log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"])
return err
}
if jobResult.Code != Success {
log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg)
log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
return errors.New(jobResult.Msg)
}

var jobID = jobResult.Payload["jobId"].(string)
err = models.CreateCloudbrain(&models.Cloudbrain{
Status: string(models.JobWaiting),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: jobID,
JobName: jobName,
SubTaskName: SubTaskName,
JobType: jobType,
Type: models.TypeCloudBrainOne,
Uuid: uuid,
Status: string(models.JobWaiting),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: jobID,
JobName: jobName,
SubTaskName: SubTaskName,
JobType: jobType,
Type: models.TypeCloudBrainOne,
Uuid: uuid,
Image: image,
GpuQueue: gpuQueue,
ResourceSpecId: resourceSpecId,
})

if err != nil {
@@ -212,3 +216,111 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath,

return nil
}

func RestartTask(ctx *context.Context, task *models.Cloudbrain) error {
dataActualPath := setting.Attachment.Minio.RealPath +
setting.Attachment.Minio.Bucket + "/" +
setting.Attachment.Minio.BasePath +
models.AttachmentRelativePath(task.Uuid) +
task.Uuid
jobName := task.JobName

var resourceSpec *models.ResourceSpec
for _, spec := range ResourceSpecs.ResourceSpec {
if task.ResourceSpecId == spec.Id {
resourceSpec = spec
}
}

if resourceSpec == nil {
log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"])
return errors.New("no such resourceSpec")
}

jobResult, err := CreateJob(jobName, models.CreateJobParams{
JobName: jobName,
RetryCount: 1,
GpuType: task.GpuQueue,
Image: task.Image,
TaskRoles: []models.TaskRole{
{
Name: SubTaskName,
TaskNumber: 1,
MinSucceededTaskCount: 1,
MinFailedTaskCount: 1,
CPUNumber: resourceSpec.CpuNum,
GPUNumber: resourceSpec.GpuNum,
MemoryMB: resourceSpec.MemMiB,
ShmMB: resourceSpec.ShareMemMiB,
Command: Command,
NeedIBDevice: false,
IsMainRole: false,
UseNNI: false,
},
},
Volumes: []models.Volume{
{
HostPath: models.StHostPath{
Path: setting.JobPath + jobName + CodeMountPath,
MountPath: CodeMountPath,
ReadOnly: false,
},
},
{
HostPath: models.StHostPath{
Path: dataActualPath,
MountPath: DataSetMountPath,
ReadOnly: true,
},
},
{
HostPath: models.StHostPath{
Path: storage.GetMinioPath(jobName, ModelMountPath + "/"),
MountPath: ModelMountPath,
ReadOnly: false,
},
},
{
HostPath: models.StHostPath{
Path: storage.GetMinioPath(jobName, BenchMarkMountPath + "/"),
MountPath: BenchMarkMountPath,
ReadOnly: true,
},
},
{
HostPath: models.StHostPath{
Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath + "/"),
MountPath: Snn4imagenetMountPath,
ReadOnly: true,
},
},
{
HostPath: models.StHostPath{
Path: storage.GetMinioPath(jobName, BrainScoreMountPath + "/"),
MountPath: BrainScoreMountPath,
ReadOnly: true,
},
},
},
})
if err != nil {
log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"])
return err
}
if jobResult.Code != Success {
log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
return errors.New(jobResult.Msg)
}

var jobID = jobResult.Payload["jobId"].(string)
task.JobID = jobID
task.Status = string(models.JobWaiting)
err = models.UpdateJob(task)

if err != nil {
log.Error("UpdateJob(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
return err
}

return nil
}

+ 5
- 0
modules/storage/minio.go View File

@@ -11,6 +11,7 @@ import (
"strings"
"time"

"code.gitea.io/gitea/modules/setting"
"github.com/minio/minio-go"
)

@@ -128,3 +129,7 @@ func (m *MinioStorage) UploadObject(fileName, filePath string) error {
_, err := m.client.FPutObject(m.bucket, fileName, filePath, minio.PutObjectOptions{})
return err
}

func GetMinioPath(jobName, suffixPath string) string {
return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + jobName + suffixPath
}

+ 8
- 14
routers/repo/cloudbrain.go View File

@@ -236,9 +236,9 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
uploadCodeToMinio(brainScorePath+"/", jobName, cloudbrain.BrainScoreMountPath+"/")
}

err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, codePath, getMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
getMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), getMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
getMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId)
err = cloudbrain.GenerateTask(ctx, jobName, image, command, uuid, codePath, storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId)
if err != nil {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tplCloudBrainNew, &form)
@@ -256,6 +256,8 @@ func CloudBrainRestart(ctx *context.Context) {
3、更新此任务的状态
*/

//todo: 是否启用事务?

var jobID = ctx.Params(":jobid")
task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
@@ -283,13 +285,9 @@ func CloudBrainRestart(ctx *context.Context) {
}
}

/*jobName := task.JobName
codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
err = cloudbrain.GenerateTask(ctx, jobName, image, cloudbrain.Command, task.Uuid, codePath, getMinioPath(jobName, cloudbrain.ModelMountPath+"/"),
getMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), getMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
getMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, resourceSpecId)
*/
err = cloudbrain.RestartTask(ctx, task)
if err != nil {
log.Error("RestartTask failed:%v", err.Error(), ctx.Data["MsgID"])
ctx.RenderWithErr(err.Error(), tplCloudBrainIndex, nil)
return
}
@@ -609,7 +607,7 @@ func getImages(ctx *context.Context, imageType string) {

func GetModelDirs(jobName string, parentDir string) (string, error) {
var req string
modelActualPath := getMinioPath(jobName, cloudbrain.ModelMountPath+"/")
modelActualPath := storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/")
if parentDir == "" {
req = "baseDir=" + modelActualPath
} else {
@@ -619,10 +617,6 @@ func GetModelDirs(jobName string, parentDir string) (string, error) {
return getDirs(req)
}

func getMinioPath(jobName, suffixPath string) string {
return setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + jobName + suffixPath
}

func CloudBrainDownloadModel(ctx *context.Context) {
parentDir := ctx.Query("parentDir")
fileName := ctx.Query("fileName")


+ 6
- 5
routers/repo/modelarts.go View File

@@ -245,13 +245,13 @@ func NotebookManage(ctx *context.Context) {
if action == models.ActionStop {
if task.Status != string(models.ModelArtsRunning) {
log.Error("the job(%s) is not running", task.JobName)
ctx.ServerError("the job is not running", errors.New("the job is not running"))
ctx.RenderWithErr("the job is not running", tplDebugJobIndex, nil)
return
}
} else if action == models.ActionRestart {
if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
log.Error("the job(%s) is not stopped", task.JobName)
ctx.ServerError("the job is not running", errors.New("the job is not running"))
ctx.RenderWithErr("the job is not stopped", tplDebugJobIndex, nil)
return
}

@@ -269,7 +269,7 @@ func NotebookManage(ctx *context.Context) {
}
} else {
log.Error("the action(%s) is illegal", action)
ctx.ServerError("the action is illegal", errors.New("the action is illegal"))
ctx.RenderWithErr("非法操作", tplDebugJobIndex, nil)
return
}

@@ -279,14 +279,15 @@ func NotebookManage(ctx *context.Context) {
res, err := modelarts.ManageNotebook(jobID, param)
if err != nil {
log.Error("ManageNotebook(%s) failed:%v", task.JobName, err.Error())
ctx.ServerError("ManageNotebook failed", err)
ctx.RenderWithErr("启动失败", tplDebugJobIndex, nil)
return
}

task.Status = res.CurrentStatus
err = models.UpdateJob(task)
if err != nil {
ctx.ServerError("UpdateJob failed", err)
log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error())
ctx.RenderWithErr("system error", tplDebugJobIndex, nil)
return
}
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob")


+ 0
- 0
templates/repo/debugjob/index.tmpl View File


Loading…
Cancel
Save