Browse Source

提交代码。

Signed-off-by: zouap <zouap@pcl.ac.cn>
tags/v1.22.7.1
zouap 3 years ago
parent
commit
6ff71defe6
3 changed files with 77 additions and 17 deletions
  1. +7
    -0
      models/ai_model_manage.go
  2. +2
    -2
      routers/api/v1/repo/cloudbrain.go
  3. +68
    -15
      routers/repo/ai_model_convert.go

+ 7
- 0
models/ai_model_manage.go View File

@@ -85,6 +85,13 @@ type AiModelQueryOptions struct {
Status int
}

// IsGpuTrainTask reports whether the conversion task's SrcEngine is 0 or 1.
//
// NOTE(review): the values 0 and 1 presumably correspond to the PyTorch and
// TensorFlow source engines, which callers treat as the GPU path — confirm
// against the engine constants declared in the routers package.
func (a *AiModelConvert) IsGpuTrainTask() bool {
	return a.SrcEngine == 0 || a.SrcEngine == 1
}

func ModelComputeAndSetDuration(task *AiModelConvert, result JobResultPayload) {
if task.StartTime == 0 {
task.StartTime = timeutil.TimeStamp(result.JobStatus.CreatedTime / 1000)


+ 2
- 2
routers/api/v1/repo/cloudbrain.go View File

@@ -113,7 +113,7 @@ func GetCloudbrainModelConvertTask(ctx *context.APIContext) {
log.Error("GetCloudbrainByID failed:", err)
return
}
if job.SrcEngine == 0 {
if job.IsGpuTrainTask() {
jobResult, err := cloudbrain.GetJob(job.CloudBrainTaskId)
if err != nil {
ctx.NotFound(err)
@@ -271,7 +271,7 @@ func CloudBrainModelConvertList(ctx *context.APIContext) {
log.Error("GetCloudbrainByJobID(%s) failed:%v", job.Name, err.Error())
return
}
if job.SrcEngine == 0 {
if job.IsGpuTrainTask() {
//get dirs
dirs, err := routerRepo.GetModelDirs(job.ID, parentDir)
if err != nil {


+ 68
- 15
routers/repo/ai_model_convert.go View File

@@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/modelarts"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/storage"
"code.gitea.io/gitea/modules/timeutil"
uuid "github.com/satori/go.uuid"
)
@@ -101,9 +102,9 @@ func SaveModelConvert(ctx *context.Context) {
UserId: ctx.User.ID,
}
models.SaveModelConvert(modelConvert)
if modelConvert.SrcEngine == PYTORCH_ENGINE || modelConvert.SrcEngine == TENSORFLOW_ENGINE {
if modelConvert.IsGpuTrainTask() {
log.Info("create gpu train job.")
err = createGpuTrainJob(modelConvert, ctx, task.Path)
err = createGpuTrainJob(modelConvert, ctx, task)
} else {
//create npu job
log.Info("create npu train job.")
@@ -267,15 +268,77 @@ func downloadConvertCode(repopath string, codePath, branchName string) error {
return nil
}

func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, modelRelativePath string) error {
// downloadFromObsToLocal copies every file object listed under the model's OBS
// prefix (Model_prefix + models.AttachmentRelativePath(task.ID) + "/") in
// setting.Bucket into the local directory localPath. localPath is created with
// os.MkdirAll when os.Stat reports an error for it. Directory entries in the
// listing are only logged, not created.
//
// It returns the first error hit while listing the bucket, creating the
// directory, or saving one of the objects; files already written stay on disk.
func downloadFromObsToLocal(task *models.AiModelManage, localPath string) error {
	path := Model_prefix + models.AttachmentRelativePath(task.ID) + "/"
	allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path)
	if err != nil {
		log.Info("error,msg=" + err.Error())
		return err
	}

	if _, errState := os.Stat(localPath); errState != nil {
		if err = os.MkdirAll(localPath, os.ModePerm); err != nil {
			return err
		}
	}

	for _, oneFile := range allFile {
		if oneFile.IsDir {
			log.Info(" dir name:" + oneFile.FileName)
			continue
		}
		// One helper call per object so that the file handle and the response
		// body are released before the next object is fetched.
		if err = saveObsObjectToLocalFile(path+oneFile.FileName, localPath+"/"+oneFile.FileName); err != nil {
			return err
		}
	}
	return nil
}

// saveObsObjectToLocalFile creates destPath and fills it with the content of
// the object objectKey downloaded from setting.Bucket. It returns an error if
// the file cannot be created, the download cannot be started, or writing or
// closing the local file fails.
func saveObsObjectToLocalFile(objectKey, destPath string) error {
	fDest, err := os.Create(destPath)
	if err != nil {
		log.Info("create file error, download file failed: %s\n", err.Error())
		return err
	}

	body, err := storage.ObsDownloadAFile(setting.Bucket, objectKey)
	if err != nil {
		// The download error is the one worth reporting; the file is empty anyway.
		_ = fDest.Close()
		log.Info("download file failed: %s\n", err.Error())
		return err
	}
	defer body.Close()

	buf := make([]byte, 1024)
	for {
		readCount, readErr := body.Read(buf)
		// Write whatever was read before looking at the error: Read may return
		// data together with a non-nil error.
		if readCount > 0 {
			if _, writeErr := fDest.Write(buf[:readCount]); writeErr != nil {
				// The write error takes precedence over any close error.
				_ = fDest.Close()
				return writeErr
			}
		}
		if readErr != nil {
			// NOTE(review): any read error ends the copy, as before. Telling
			// end-of-stream apart from a real failure needs the io package,
			// whose import cannot be confirmed from this view.
			break
		}
	}
	// Close explicitly so a failure to close the written file is reported.
	return fDest.Close()
}

func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, model *models.AiModelManage) error {
modelRelativePath := model.Path
command := ""
IMAGE_URL := GPU_PYTORCH_IMAGE
dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath

if modelConvert.SrcEngine == PYTORCH_ENGINE {
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, PytorchBootFile)
} else if modelConvert.SrcEngine == TENSORFLOW_ENGINE {
IMAGE_URL = GPU_TENSORFLOW_IMAGE
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, TensorFlowGpuBootFile)
//如果模型在OBS上,需要下载到本地,并上传到minio中
if model.Type == models.TypeCloudBrainTwo {
relatetiveModelPath := setting.JobPath + modelConvert.ID + "/dataset"
log.Info("local dataset path:" + relatetiveModelPath)
downloadFromObsToLocal(model, relatetiveModelPath)
uploadCodeToMinio(relatetiveModelPath+"/", modelConvert.ID, "/dataset/")
dataActualPath = setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/dataset"
}

}
log.Info("dataActualPath=" + dataActualPath)

log.Info("command=" + command)

codePath := setting.JobPath + modelConvert.ID + CodeMountPath
@@ -294,9 +357,6 @@ func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context
minioModelPath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/model"
log.Info("minio model path=" + minioModelPath)

dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath
log.Info("dataActualPath=" + dataActualPath)

if TrainResourceSpecs == nil {
json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
}
@@ -391,13 +451,6 @@ func DeleteModelConvert(ctx *context.Context) {
}
}

// isCloudBrainTask reports whether the conversion task's source engine is
// PYTORCH_ENGINE or TENSORFLOW_ENGINE.
func isCloudBrainTask(task *models.AiModelConvert) bool {
	return task.SrcEngine == PYTORCH_ENGINE || task.SrcEngine == TENSORFLOW_ENGINE
}

func StopModelConvert(ctx *context.Context) {
id := ctx.Params(":id")
log.Info("stop model convert start.id=" + id)
@@ -406,7 +459,7 @@ func StopModelConvert(ctx *context.Context) {
ctx.ServerError("Not found task.", err)
return
}
if isCloudBrainTask(job) {
if job.IsGpuTrainTask() {
err = cloudbrain.StopJob(job.CloudBrainTaskId)
if err != nil {
log.Error("Stop cloudbrain Job(%s) failed:%v", job.CloudBrainTaskId, err)
@@ -448,7 +501,7 @@ func ShowModelConvertInfo(ctx *context.Context) {
job.UserRelAvatarLink = user.RelAvatarLink()
}

if isCloudBrainTask(job) {
if job.IsGpuTrainTask() {
ctx.Data["npu_display"] = "none"
ctx.Data["gpu_display"] = "block"
result, err := cloudbrain.GetJob(job.CloudBrainTaskId)


Loading…
Cancel
Save