| @@ -3,6 +3,7 @@ package repo | |||||
| import ( | import ( | ||||
| "encoding/json" | "encoding/json" | ||||
| "errors" | "errors" | ||||
| "strings" | |||||
| "code.gitea.io/gitea/models" | "code.gitea.io/gitea/models" | ||||
| "code.gitea.io/gitea/modules/cloudbrain" | "code.gitea.io/gitea/modules/cloudbrain" | ||||
| @@ -90,99 +91,111 @@ func SaveModelConvert(ctx *context.Context) { | |||||
| func createTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, modelRelativePath string) error { | func createTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, modelRelativePath string) error { | ||||
| repo, _ := models.GetRepositoryByID(ctx.Repo.Repository.ID) | repo, _ := models.GetRepositoryByID(ctx.Repo.Repository.ID) | ||||
| command := "" | |||||
| if modelConvert.SrcEngine == PYTORCH_ENGINE { | if modelConvert.SrcEngine == PYTORCH_ENGINE { | ||||
| command = getPytorchModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert) | |||||
| } | |||||
| log.Info("command=" + command) | |||||
| codePath := setting.JobPath + modelConvert.ID + CodeMountPath | |||||
| downloadCode(repo, codePath, DefaultBranchName) | |||||
| codePath := setting.JobPath + modelConvert.ID + CodeMountPath | |||||
| downloadCode(repo, codePath, DefaultBranchName) | |||||
| uploadCodeToMinio(codePath+"/", modelConvert.ID, CodeMountPath+"/") | |||||
| log.Info("minio code path=" + setting.CBCodePathPrefix + modelConvert.ID) | |||||
| uploadCodeToMinio(codePath+"/", modelConvert.ID, CodeMountPath+"/") | |||||
| minioCodePath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/code" | |||||
| log.Info("Volume codePath=" + minioCodePath) | |||||
| minioCodePath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/code" | |||||
| log.Info("minio codePath=" + minioCodePath) | |||||
| modelPath := setting.JobPath + modelConvert.ID + ModelMountPath + "/" | |||||
| log.Info("modelPath=" + modelPath) | |||||
| mkModelPath(modelPath) | |||||
| modelPath := setting.JobPath + modelConvert.ID + ModelMountPath + "/" | |||||
| log.Info("local modelPath=" + modelPath) | |||||
| mkModelPath(modelPath) | |||||
| uploadCodeToMinio(modelPath, modelConvert.ID, ModelMountPath+"/") | |||||
| uploadCodeToMinio(modelPath, modelConvert.ID, ModelMountPath+"/") | |||||
| minioModelPath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/model" | |||||
| log.Info("minio model path=" + minioModelPath) | |||||
| command := getModelConvertCommand(modelConvert.ID, modelConvert.ModelPath) | |||||
| log.Info("command=" + command) | |||||
| dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath | |||||
| log.Info("dataActualPath=" + dataActualPath) | |||||
| dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath | |||||
| log.Info("dataActualPath=" + dataActualPath) | |||||
| if TrainResourceSpecs == nil { | |||||
| json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) | |||||
| } | |||||
| resourceSpec := TrainResourceSpecs.ResourceSpec[1] | |||||
| jobResult, err := cloudbrain.CreateJob(modelConvert.ID, models.CreateJobParams{ | |||||
| JobName: modelConvert.ID, | |||||
| RetryCount: 1, | |||||
| GpuType: GpuQueue, | |||||
| Image: GPU_PYTORCH_IMAGE, | |||||
| TaskRoles: []models.TaskRole{ | |||||
| { | |||||
| Name: SubTaskName, | |||||
| TaskNumber: 1, | |||||
| MinSucceededTaskCount: 1, | |||||
| MinFailedTaskCount: 1, | |||||
| CPUNumber: resourceSpec.CpuNum, | |||||
| GPUNumber: resourceSpec.GpuNum, | |||||
| MemoryMB: resourceSpec.MemMiB, | |||||
| ShmMB: resourceSpec.ShareMemMiB, | |||||
| Command: command, | |||||
| NeedIBDevice: false, | |||||
| IsMainRole: false, | |||||
| UseNNI: false, | |||||
| }, | |||||
| if TrainResourceSpecs == nil { | |||||
| json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) | |||||
| } | |||||
| resourceSpec := TrainResourceSpecs.ResourceSpec[1] | |||||
| jobResult, err := cloudbrain.CreateJob(modelConvert.ID, models.CreateJobParams{ | |||||
| JobName: modelConvert.ID, | |||||
| RetryCount: 1, | |||||
| GpuType: GpuQueue, | |||||
| Image: GPU_PYTORCH_IMAGE, | |||||
| TaskRoles: []models.TaskRole{ | |||||
| { | |||||
| Name: SubTaskName, | |||||
| TaskNumber: 1, | |||||
| MinSucceededTaskCount: 1, | |||||
| MinFailedTaskCount: 1, | |||||
| CPUNumber: resourceSpec.CpuNum, | |||||
| GPUNumber: resourceSpec.GpuNum, | |||||
| MemoryMB: resourceSpec.MemMiB, | |||||
| ShmMB: resourceSpec.ShareMemMiB, | |||||
| Command: command, | |||||
| NeedIBDevice: false, | |||||
| IsMainRole: false, | |||||
| UseNNI: false, | |||||
| }, | }, | ||||
| Volumes: []models.Volume{ | |||||
| { | |||||
| HostPath: models.StHostPath{ | |||||
| Path: minioCodePath, | |||||
| MountPath: CodeMountPath, | |||||
| ReadOnly: false, | |||||
| }, | |||||
| }, | |||||
| Volumes: []models.Volume{ | |||||
| { | |||||
| HostPath: models.StHostPath{ | |||||
| Path: minioCodePath, | |||||
| MountPath: CodeMountPath, | |||||
| ReadOnly: false, | |||||
| }, | }, | ||||
| { | |||||
| HostPath: models.StHostPath{ | |||||
| Path: dataActualPath, | |||||
| MountPath: DataSetMountPath, | |||||
| ReadOnly: true, | |||||
| }, | |||||
| }, | |||||
| { | |||||
| HostPath: models.StHostPath{ | |||||
| Path: dataActualPath, | |||||
| MountPath: DataSetMountPath, | |||||
| ReadOnly: true, | |||||
| }, | }, | ||||
| { | |||||
| HostPath: models.StHostPath{ | |||||
| Path: modelPath, | |||||
| MountPath: ModelMountPath, | |||||
| ReadOnly: false, | |||||
| }, | |||||
| }, | |||||
| { | |||||
| HostPath: models.StHostPath{ | |||||
| Path: minioModelPath, | |||||
| MountPath: ModelMountPath, | |||||
| ReadOnly: false, | |||||
| }, | }, | ||||
| }, | }, | ||||
| }) | |||||
| if err != nil { | |||||
| log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"]) | |||||
| return err | |||||
| } | |||||
| if jobResult.Code != Success { | |||||
| log.Error("CreateJob(%s) failed:%s", modelConvert.ID, jobResult.Msg, ctx.Data["MsgID"]) | |||||
| return errors.New(jobResult.Msg) | |||||
| } | |||||
| var jobID = jobResult.Payload["jobId"].(string) | |||||
| log.Info("jobId=" + jobID) | |||||
| models.UpdateModelConvertCBTI(modelConvert.ID, jobID) | |||||
| }, | |||||
| }) | |||||
| if err != nil { | |||||
| log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"]) | |||||
| return err | |||||
| } | |||||
| if jobResult.Code != Success { | |||||
| log.Error("CreateJob(%s) failed:%s", modelConvert.ID, jobResult.Msg, ctx.Data["MsgID"]) | |||||
| return errors.New(jobResult.Msg) | |||||
| } | } | ||||
| var jobID = jobResult.Payload["jobId"].(string) | |||||
| log.Info("jobId=" + jobID) | |||||
| models.UpdateModelConvertCBTI(modelConvert.ID, jobID) | |||||
| return nil | return nil | ||||
| } | } | ||||
| func getModelConvertCommand(name string, modelFile string) string { | |||||
| func getPytorchModelConvertCommand(name string, modelFile string, modelConvert *models.AiModelConvert) string { | |||||
| var command string | var command string | ||||
| bootFile := "convert_pytorch.py" | bootFile := "convert_pytorch.py" | ||||
| command += "python3 /code/" + bootFile + " --model " + modelFile + " > " + ModelMountPath + "/" + name + "-" + LogFile | |||||
| intputshape := strings.Split(modelConvert.InputShape, ",") | |||||
| n := "256" | |||||
| c := "1" | |||||
| h := "28" | |||||
| w := "28" | |||||
| if len(intputshape) == 4 { | |||||
| n = intputshape[0] | |||||
| c = intputshape[1] | |||||
| h = intputshape[2] | |||||
| w = intputshape[3] | |||||
| } | |||||
| command += "python3 /code/" + bootFile + " --model " + modelFile + " --n " + n + " --c " + c + " --h " + h + " --w " + w + " > " + ModelMountPath + "/" + name + "-" + LogFile | |||||
| return command | return command | ||||
| } | } | ||||