diff --git a/models/ai_model_manage.go b/models/ai_model_manage.go index d59e0528a..663b08527 100644 --- a/models/ai_model_manage.go +++ b/models/ai_model_manage.go @@ -85,6 +85,13 @@ type AiModelQueryOptions struct { Status int } +func (a *AiModelConvert) IsGpuTrainTask() bool { + if a.SrcEngine == 0 || a.SrcEngine == 1 { + return true + } + return false +} + func ModelComputeAndSetDuration(task *AiModelConvert, result JobResultPayload) { if task.StartTime == 0 { task.StartTime = timeutil.TimeStamp(result.JobStatus.CreatedTime / 1000) diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index aabdc44f8..9d1b1fb0f 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -113,7 +113,7 @@ func GetCloudbrainModelConvertTask(ctx *context.APIContext) { log.Error("GetCloudbrainByID failed:", err) return } - if job.SrcEngine == 0 { + if job.IsGpuTrainTask() { jobResult, err := cloudbrain.GetJob(job.CloudBrainTaskId) if err != nil { ctx.NotFound(err) @@ -271,7 +271,7 @@ func CloudBrainModelConvertList(ctx *context.APIContext) { log.Error("GetCloudbrainByJobID(%s) failed:%v", job.Name, err.Error()) return } - if job.SrcEngine == 0 { + if job.IsGpuTrainTask() { //get dirs dirs, err := routerRepo.GetModelDirs(job.ID, parentDir) if err != nil { diff --git a/routers/repo/ai_model_convert.go b/routers/repo/ai_model_convert.go index 4221435da..0d5eb87a0 100644 --- a/routers/repo/ai_model_convert.go +++ b/routers/repo/ai_model_convert.go @@ -17,6 +17,7 @@ import ( "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/modelarts" "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/storage" "code.gitea.io/gitea/modules/timeutil" uuid "github.com/satori/go.uuid" ) @@ -101,9 +102,9 @@ func SaveModelConvert(ctx *context.Context) { UserId: ctx.User.ID, } models.SaveModelConvert(modelConvert) - if modelConvert.SrcEngine == PYTORCH_ENGINE || modelConvert.SrcEngine == TENSORFLOW_ENGINE { + if modelConvert.IsGpuTrainTask() { log.Info("create gpu train job.") - err = createGpuTrainJob(modelConvert, ctx, task.Path) + err = createGpuTrainJob(modelConvert, ctx, task) } else { //create npu job log.Info("create npu train job.") @@ -267,15 +268,77 @@ func downloadConvertCode(repopath string, codePath, branchName string) error { return nil } -func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, modelRelativePath string) error { +func downloadFromObsToLocal(task *models.AiModelManage, localPath string) error { + path := Model_prefix + models.AttachmentRelativePath(task.ID) + "/" + allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path) + if err == nil { + _, errState := os.Stat(localPath) + if errState != nil { + if err = os.MkdirAll(localPath, os.ModePerm); err != nil { + return err + } + } + for _, oneFile := range allFile { + if oneFile.IsDir { + log.Info(" dir name:" + oneFile.FileName) + } else { + fDest, err := os.Create(localPath + "/" + oneFile.FileName) + if err != nil { + log.Info("create file error, download file failed: %s\n", err.Error()) + return err + } + body, err := storage.ObsDownloadAFile(setting.Bucket, path+oneFile.FileName) + if err != nil { + log.Info("download file failed: %s\n", err.Error()) + return err + } else { + defer body.Close() + p := make([]byte, 1024) + var readErr error + var readCount int + // 读取对象内容 + for { + readCount, readErr = body.Read(p) + if readCount > 0 { + fDest.Write(p[:readCount]) + } + if readErr != nil { + break + } + } + } + } + } + } else { + log.Info("error,msg=" + err.Error()) + return err + } + return nil +} + +func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, model *models.AiModelManage) error { + modelRelativePath := model.Path command := "" IMAGE_URL := GPU_PYTORCH_IMAGE + dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath + if modelConvert.SrcEngine == PYTORCH_ENGINE { command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, PytorchBootFile) } else if modelConvert.SrcEngine == TENSORFLOW_ENGINE { IMAGE_URL = GPU_TENSORFLOW_IMAGE command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, TensorFlowGpuBootFile) + //如果模型在OBS上,需要下载到本地,并上传到minio中 + if model.Type == models.TypeCloudBrainTwo { + relatetiveModelPath := setting.JobPath + modelConvert.ID + "/dataset" + log.Info("local dataset path:" + relatetiveModelPath) + downloadFromObsToLocal(model, relatetiveModelPath) + uploadCodeToMinio(relatetiveModelPath+"/", modelConvert.ID, "/dataset/") + dataActualPath = setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/dataset" + } + } + log.Info("dataActualPath=" + dataActualPath) + log.Info("command=" + command) codePath := setting.JobPath + modelConvert.ID + CodeMountPath @@ -294,9 +357,6 @@ func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context minioModelPath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/model" log.Info("minio model path=" + minioModelPath) - dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath - log.Info("dataActualPath=" + dataActualPath) - if TrainResourceSpecs == nil { json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) } @@ -391,13 +451,6 @@ func DeleteModelConvert(ctx *context.Context) { } } -func isCloudBrainTask(task *models.AiModelConvert) bool { - if task.SrcEngine == PYTORCH_ENGINE || task.SrcEngine == TENSORFLOW_ENGINE { - return true - } - return false -} - func StopModelConvert(ctx *context.Context) { id := ctx.Params(":id") log.Info("stop model convert start.id=" + id) @@ -406,7 +459,7 @@ func StopModelConvert(ctx *context.Context) { ctx.ServerError("Not found task.", err) return } - if isCloudBrainTask(job) { + if job.IsGpuTrainTask() { err = cloudbrain.StopJob(job.CloudBrainTaskId) if err != nil { log.Error("Stop cloudbrain Job(%s) failed:%v", job.CloudBrainTaskId, err) @@ -448,7 +501,7 @@ func ShowModelConvertInfo(ctx *context.Context) { job.UserRelAvatarLink = user.RelAvatarLink() } - if isCloudBrainTask(job) { + if job.IsGpuTrainTask() { ctx.Data["npu_display"] = "none" ctx.Data["gpu_display"] = "block" result, err := cloudbrain.GetJob(job.CloudBrainTaskId)