|
|
|
@@ -17,6 +17,7 @@ import ( |
|
|
|
"code.gitea.io/gitea/modules/log" |
|
|
|
"code.gitea.io/gitea/modules/modelarts" |
|
|
|
"code.gitea.io/gitea/modules/setting" |
|
|
|
"code.gitea.io/gitea/modules/storage" |
|
|
|
"code.gitea.io/gitea/modules/timeutil" |
|
|
|
uuid "github.com/satori/go.uuid" |
|
|
|
) |
|
|
|
@@ -101,9 +102,9 @@ func SaveModelConvert(ctx *context.Context) { |
|
|
|
UserId: ctx.User.ID, |
|
|
|
} |
|
|
|
models.SaveModelConvert(modelConvert) |
|
|
|
if modelConvert.SrcEngine == PYTORCH_ENGINE || modelConvert.SrcEngine == TENSORFLOW_ENGINE { |
|
|
|
if modelConvert.IsGpuTrainTask() { |
|
|
|
log.Info("create gpu train job.") |
|
|
|
err = createGpuTrainJob(modelConvert, ctx, task.Path) |
|
|
|
err = createGpuTrainJob(modelConvert, ctx, task) |
|
|
|
} else { |
|
|
|
//create npu job |
|
|
|
log.Info("create npu train job.") |
|
|
|
@@ -267,15 +268,77 @@ func downloadConvertCode(repopath string, codePath, branchName string) error { |
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, modelRelativePath string) error { |
|
|
|
func downloadFromObsToLocal(task *models.AiModelManage, localPath string) error { |
|
|
|
path := Model_prefix + models.AttachmentRelativePath(task.ID) + "/" |
|
|
|
allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path) |
|
|
|
if err == nil { |
|
|
|
_, errState := os.Stat(localPath) |
|
|
|
if errState != nil { |
|
|
|
if err = os.MkdirAll(localPath, os.ModePerm); err != nil { |
|
|
|
return err |
|
|
|
} |
|
|
|
} |
|
|
|
for _, oneFile := range allFile { |
|
|
|
if oneFile.IsDir { |
|
|
|
log.Info(" dir name:" + oneFile.FileName) |
|
|
|
} else { |
|
|
|
fDest, err := os.Create(localPath + "/" + oneFile.FileName) |
|
|
|
if err != nil { |
|
|
|
log.Info("create file error, download file failed: %s\n", err.Error()) |
|
|
|
return err |
|
|
|
} |
|
|
|
body, err := storage.ObsDownloadAFile(setting.Bucket, path+oneFile.FileName) |
|
|
|
if err != nil { |
|
|
|
log.Info("download file failed: %s\n", err.Error()) |
|
|
|
return err |
|
|
|
} else { |
|
|
|
defer body.Close() |
|
|
|
p := make([]byte, 1024) |
|
|
|
var readErr error |
|
|
|
var readCount int |
|
|
|
// 读取对象内容 |
|
|
|
for { |
|
|
|
readCount, readErr = body.Read(p) |
|
|
|
if readCount > 0 { |
|
|
|
fDest.Write(p[:readCount]) |
|
|
|
} |
|
|
|
if readErr != nil { |
|
|
|
break |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
log.Info("error,msg=" + err.Error()) |
|
|
|
return err |
|
|
|
} |
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, model *models.AiModelManage) error { |
|
|
|
modelRelativePath := model.Path |
|
|
|
command := "" |
|
|
|
IMAGE_URL := GPU_PYTORCH_IMAGE |
|
|
|
dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath |
|
|
|
|
|
|
|
if modelConvert.SrcEngine == PYTORCH_ENGINE { |
|
|
|
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, PytorchBootFile) |
|
|
|
} else if modelConvert.SrcEngine == TENSORFLOW_ENGINE { |
|
|
|
IMAGE_URL = GPU_TENSORFLOW_IMAGE |
|
|
|
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, TensorFlowGpuBootFile) |
|
|
|
//如果模型在OBS上,需要下载到本地,并上传到minio中 |
|
|
|
if model.Type == models.TypeCloudBrainTwo { |
|
|
|
relatetiveModelPath := setting.JobPath + modelConvert.ID + "/dataset" |
|
|
|
log.Info("local dataset path:" + relatetiveModelPath) |
|
|
|
downloadFromObsToLocal(model, relatetiveModelPath) |
|
|
|
uploadCodeToMinio(relatetiveModelPath+"/", modelConvert.ID, "/dataset/") |
|
|
|
dataActualPath = setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/dataset" |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
log.Info("dataActualPath=" + dataActualPath) |
|
|
|
|
|
|
|
log.Info("command=" + command) |
|
|
|
|
|
|
|
codePath := setting.JobPath + modelConvert.ID + CodeMountPath |
|
|
|
@@ -294,9 +357,6 @@ func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context |
|
|
|
minioModelPath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/model" |
|
|
|
log.Info("minio model path=" + minioModelPath) |
|
|
|
|
|
|
|
dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath |
|
|
|
log.Info("dataActualPath=" + dataActualPath) |
|
|
|
|
|
|
|
if TrainResourceSpecs == nil { |
|
|
|
json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) |
|
|
|
} |
|
|
|
@@ -391,13 +451,6 @@ func DeleteModelConvert(ctx *context.Context) { |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func isCloudBrainTask(task *models.AiModelConvert) bool { |
|
|
|
if task.SrcEngine == PYTORCH_ENGINE || task.SrcEngine == TENSORFLOW_ENGINE { |
|
|
|
return true |
|
|
|
} |
|
|
|
return false |
|
|
|
} |
|
|
|
|
|
|
|
func StopModelConvert(ctx *context.Context) { |
|
|
|
id := ctx.Params(":id") |
|
|
|
log.Info("stop model convert start.id=" + id) |
|
|
|
@@ -406,7 +459,7 @@ func StopModelConvert(ctx *context.Context) { |
|
|
|
ctx.ServerError("Not found task.", err) |
|
|
|
return |
|
|
|
} |
|
|
|
if isCloudBrainTask(job) { |
|
|
|
if job.IsGpuTrainTask() { |
|
|
|
err = cloudbrain.StopJob(job.CloudBrainTaskId) |
|
|
|
if err != nil { |
|
|
|
log.Error("Stop cloudbrain Job(%s) failed:%v", job.CloudBrainTaskId, err) |
|
|
|
@@ -448,7 +501,7 @@ func ShowModelConvertInfo(ctx *context.Context) { |
|
|
|
job.UserRelAvatarLink = user.RelAvatarLink() |
|
|
|
} |
|
|
|
|
|
|
|
if isCloudBrainTask(job) { |
|
|
|
if job.IsGpuTrainTask() { |
|
|
|
ctx.Data["npu_display"] = "none" |
|
|
|
ctx.Data["gpu_display"] = "block" |
|
|
|
result, err := cloudbrain.GetJob(job.CloudBrainTaskId) |
|
|
|
|