package repo

import (
	"encoding/json"
	"errors"

	"code.gitea.io/gitea/models"
	"code.gitea.io/gitea/modules/cloudbrain"
	"code.gitea.io/gitea/modules/context"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/setting"

	uuid "github.com/satori/go.uuid"
)

const (
	// Template names rendered by the model-convert handlers in this file.
	tplModelManageConvertIndex = "repo/modelmanage/convertIndex"
	tplModelConvertInfo        = "repo/modelmanage/convertshowinfo"

	// Source-engine identifiers; createTrainJob compares
	// AiModelConvert.SrcEngine against PYTORCH_ENGINE.
	PYTORCH_ENGINE    = 0
	TENSORFLOW_ENGINE = 1
	MINDSPORE_ENGIN   = 2

	// Mount points used for the volumes of a conversion job.
	ModelMountPath   = "/model"
	CodeMountPath    = "/code"
	DataSetMountPath = "/dataset"

	// LogFile is the suffix of the file the conversion command redirects
	// its output to (see getModelConvertCommand).
	LogFile = "log.txt"
	// DefaultBranchName is the branch passed to downloadCode.
	DefaultBranchName = "master"
	// SubTaskName is the name of the single task role of a conversion job.
	SubTaskName = "task1"
	// GpuQueue is passed as GpuType when creating the job.
	GpuQueue = "openidgx"
	// Success is the response code CreateJob results are compared against.
	Success = "S000"
	// GPU_PYTORCH_IMAGE is the image used for PyTorch conversion jobs.
	GPU_PYTORCH_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tensorRT_7_zouap"
)

var (
	// TrainResourceSpecs caches the decoded setting.TrainResourceSpecs; it is
	// populated lazily by createTrainJob.
	TrainResourceSpecs *models.ResourceSpecs
)

// SaveModelConvert creates a model-convert record from the request's query
// parameters and submits the corresponding conversion job.
//
// It answers 403 when the user cannot write to the model-manage unit, a
// server error when the referenced model cannot be loaded, and otherwise a
// 200 JSON object whose "result_code" is "0" on success and "1" when the job
// could not be created.
func SaveModelConvert(ctx *context.Context) {
	log.Info("save model convert start.")
	if !ctx.Repo.CanWrite(models.UnitTypeModelManage) {
		ctx.JSON(403, ctx.Tr("repo.model_noright"))
		return
	}

	name := ctx.Query("name")
	desc := ctx.Query("desc")
	modelId := ctx.Query("modelId")
	modelPath := ctx.Query("ModelFile")
	srcEngine := ctx.QueryInt("SrcEngine")
	inputShape := ctx.Query("inputshape")
	inputDataFormat := ctx.Query("inputdataformat")
	destFormat := ctx.QueryInt("DestFormat")
	netOutputFormat := ctx.QueryInt("NetOutputFormat")

	task, err := models.QueryModelById(modelId)
	if err != nil {
		log.Error("no such model: %v", err)
		ctx.ServerError("no such model:", err)
		return
	}

	// Do not bind the result to a local named "uuid": that would shadow the
	// imported package for the rest of the function.
	id := uuid.NewV4().String()
	modelConvert := &models.AiModelConvert{
		ID:              id,
		Name:            name,
		Description:     desc,
		Status:          string(models.JobWaiting),
		SrcEngine:       srcEngine,
		RepoId:          ctx.Repo.Repository.ID,
		ModelName:       task.Name,
		ModelVersion:    task.Version,
		ModelId:         modelId,
		ModelPath:       modelPath,
		DestFormat:      destFormat,
		NetOutputFormat: netOutputFormat,
		InputShape:      inputShape,
		InputDataFormat: inputDataFormat,
		UserId:          ctx.User.ID,
	}
	// NOTE(review): any result returned by models.SaveModelConvert is not
	// inspected here — confirm its signature and handle a failure if it
	// reports one.
	models.SaveModelConvert(modelConvert)

	resultCode := "0"
	if err := createTrainJob(modelConvert, ctx, task.Path); err != nil {
		log.Error("create model convert job(%s) failed: %v, MsgID=%v", id, err, ctx.Data["MsgID"])
		resultCode = "1"
	}
	ctx.JSON(200, map[string]string{
		"result_code": resultCode,
	})
}

// createTrainJob prepares the code and model directories for modelConvert and
// submits a conversion job through cloudbrain.CreateJob.
//
// Only PYTORCH_ENGINE sources are handled; for any other source engine the
// function does nothing and returns nil. modelRelativePath is appended to the
// Minio real path to form the host path mounted at DataSetMountPath.
// A non-nil error is returned when the repository cannot be loaded, the
// resource specs are unusable, or the job is rejected; errors are not logged
// here, the caller is expected to do so.
func createTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, modelRelativePath string) error {
	if modelConvert.SrcEngine != PYTORCH_ENGINE {
		return nil
	}

	repo, err := models.GetRepositoryByID(ctx.Repo.Repository.ID)
	if err != nil {
		return err
	}

	// NOTE(review): results of downloadCode, uploadCodeToMinio and
	// mkModelPath (defined elsewhere in the package) are not inspected —
	// confirm whether they report errors that should abort the job.
	codePath := setting.JobPath + modelConvert.ID + CodeMountPath
	downloadCode(repo, codePath, DefaultBranchName)
	uploadCodeToMinio(codePath+"/", modelConvert.ID, CodeMountPath+"/")
	log.Info("minio code path=" + setting.CBCodePathPrefix + modelConvert.ID)

	minioCodePath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/code"
	log.Info("Volume codePath=" + minioCodePath)

	modelPath := setting.JobPath + modelConvert.ID + ModelMountPath + "/"
	log.Info("modelPath=" + modelPath)
	mkModelPath(modelPath)
	uploadCodeToMinio(modelPath, modelConvert.ID, ModelMountPath+"/")

	command := getModelConvertCommand(modelConvert.ID, modelConvert.ModelPath)
	// The command embeds request data, so it must not be used as the format
	// string itself.
	log.Info("command=%s", command)

	dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath
	log.Info("dataActualPath=" + dataActualPath)

	if TrainResourceSpecs == nil {
		if err := json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs); err != nil {
			// Drop any partially decoded value so the next call retries.
			TrainResourceSpecs = nil
			return err
		}
	}
	// The job always uses the spec at index 1; make sure it exists instead
	// of panicking on a short or missing configuration.
	if TrainResourceSpecs == nil || len(TrainResourceSpecs.ResourceSpec) < 2 {
		return errors.New("train resource specs have no entry at index 1")
	}
	resourceSpec := TrainResourceSpecs.ResourceSpec[1]

	jobResult, err := cloudbrain.CreateJob(modelConvert.ID, models.CreateJobParams{
		JobName:    modelConvert.ID,
		RetryCount: 1,
		GpuType:    GpuQueue,
		Image:      GPU_PYTORCH_IMAGE,
		TaskRoles: []models.TaskRole{
			{
				Name:                  SubTaskName,
				TaskNumber:            1,
				MinSucceededTaskCount: 1,
				MinFailedTaskCount:    1,
				CPUNumber:             resourceSpec.CpuNum,
				GPUNumber:             resourceSpec.GpuNum,
				MemoryMB:              resourceSpec.MemMiB,
				ShmMB:                 resourceSpec.ShareMemMiB,
				Command:               command,
				NeedIBDevice:          false,
				IsMainRole:            false,
				UseNNI:                false,
			},
		},
		Volumes: []models.Volume{
			{
				HostPath: models.StHostPath{
					Path:      minioCodePath,
					MountPath: CodeMountPath,
					ReadOnly:  false,
				},
			},
			{
				HostPath: models.StHostPath{
					Path:      dataActualPath,
					MountPath: DataSetMountPath,
					ReadOnly:  true,
				},
			},
			{
				HostPath: models.StHostPath{
					Path:      modelPath,
					MountPath: ModelMountPath,
					ReadOnly:  false,
				},
			},
		},
	})
	if err != nil {
		return err
	}
	if jobResult.Code != Success {
		return errors.New(jobResult.Msg)
	}

	// Use the comma-ok form: a missing or non-string jobId must not panic.
	jobID, ok := jobResult.Payload["jobId"].(string)
	if !ok {
		return errors.New("CreateJob response does not contain a string jobId")
	}
	log.Info("jobId=" + jobID)
	models.UpdateModelConvertCBTI(modelConvert.ID, jobID)
	return nil
}

// getModelConvertCommand builds the command line executed inside the job:
// it runs /code/convert_pytorch.py with --model modelFile and redirects
// standard output to "<ModelMountPath>/<name>-<LogFile>".
//
// NOTE(review): modelFile comes from the "ModelFile" query parameter and is
// concatenated into the command unescaped. It should be validated or
// shell-quoted; how the command is wrapped downstream is not visible here, so
// quoting is not applied in this function.
func getModelConvertCommand(name string, modelFile string) string {
	const bootFile = "convert_pytorch.py"
	return "python3 /code/" + bootFile + " --model " + modelFile + " > " + ModelMountPath + "/" + name + "-" + LogFile
}

// DeleteModelConvert deletes the model-convert record identified by the ":id"
// route parameter. On failure it answers 500 with the error text; on success
// it redirects to the repository's convert_model page.
//
// NOTE(review): unlike SaveModelConvert, no permission check is performed
// here — confirm that the route middleware enforces it.
func DeleteModelConvert(ctx *context.Context) {
	log.Info("delete model convert start.")
	id := ctx.Params(":id")
	if err := models.DeleteModelConvertById(id); err != nil {
		ctx.JSON(500, err.Error())
		return
	}
	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelmanage/convert_model")
}

// StopModelConvert currently only logs the ":id" route parameter; it does not
// stop anything and writes no response.
func StopModelConvert(ctx *context.Context) {
	id := ctx.Params(":id")
	log.Info("stop model convert start.id=%s", id)
}

// renderModelConvertInfoError logs err, exposes its text to the template as
// ctx.Data["error"] and renders the model-convert info page.
func renderModelConvertInfoError(ctx *context.Context, what string, err error) {
	log.Error("%s: %v", what, err)
	ctx.Data["error"] = err.Error()
	ctx.HTML(200, tplModelConvertInfo)
}

// ShowModelConvertInfo renders the detail page of the model-convert record
// named by the "ID" query parameter.
//
// It loads the record, queries cloudbrain for the job behind it, copies the
// job state (and, once the job has left the waiting/failed states, the
// container IP, container ID and task state) onto the record and persists the
// record when the job is no longer waiting. Failures while talking to
// cloudbrain or decoding its answer are shown on the page through
// ctx.Data["error"]; a failed record lookup yields a server error.
func ShowModelConvertInfo(ctx *context.Context) {
	id := ctx.Query("ID")
	ctx.Data["ID"] = id
	ctx.Data["isModelManage"] = true
	ctx.Data["ModelManageAccess"] = ctx.Repo.CanWrite(models.UnitTypeModelManage)

	job, err := models.QueryModelConvertById(id)
	if err != nil {
		// Without a record there is no task id to query; continuing would
		// dereference an unusable job.
		log.Error("QueryModelConvertById(%s) failed: %v", id, err)
		ctx.ServerError("QueryModelConvertById", err)
		return
	}
	ctx.Data["task"] = job

	result, err := cloudbrain.GetJob(job.CloudBrainTaskId)
	if err != nil {
		renderModelConvertInfoError(ctx, "GetJob", err)
		return
	}

	if result != nil {
		jobRes, err := models.ConvertToJobResultPayload(result.Payload)
		if err != nil {
			renderModelConvertInfoError(ctx, "ConvertToJobResultPayload", err)
			return
		}
		ctx.Data["result"] = jobRes

		taskRole, ok := jobRes.TaskRoles[cloudbrain.SubTaskName].(map[string]interface{})
		if !ok {
			renderModelConvertInfoError(ctx, "ShowModelConvertInfo", errors.New("job result has no task role "+cloudbrain.SubTaskName))
			return
		}
		taskRes, err := models.ConvertToTaskPod(taskRole)
		if err != nil {
			renderModelConvertInfoError(ctx, "ConvertToTaskPod", err)
			return
		}
		ctx.Data["taskRes"] = taskRes

		if len(taskRes.TaskStatuses) == 0 {
			renderModelConvertInfoError(ctx, "ShowModelConvertInfo", errors.New("job result contains no task status"))
			return
		}
		status := taskRes.TaskStatuses[0]
		ctx.Data["ExitDiagnostics"] = status.ExitDiagnostics

		state := jobRes.JobStatus.State
		job.Status = state
		if state != string(models.JobWaiting) && state != string(models.JobFailed) {
			job.ContainerIp = status.ContainerIP
			job.ContainerID = status.ContainerID
			job.Status = status.State
		}
		if state != string(models.JobWaiting) {
			models.ModelComputeAndSetDuration(job, jobRes)
			if err := models.UpdateModelConvert(job); err != nil {
				log.Error("UpdateModelConvert failed: %v", err)
			}
		}
	}

	ctx.HTML(200, tplModelConvertInfo)
}

// ConvertModelTemplate renders the model-convert index page, filling the page
// data through ShowModelConvertPageInfo.
func ConvertModelTemplate(ctx *context.Context) {
	ctx.Data["isModelManage"] = true
	ctx.Data["MODEL_COUNT"] = 0
	ctx.Data["ModelManageAccess"] = ctx.Repo.CanWrite(models.UnitTypeModelManage)
	ctx.Data["TRAIN_COUNT"] = 0

	ShowModelConvertPageInfo(ctx)
	// ShowModelConvertPageInfo answers with NotFound/ServerError on failure;
	// do not write a second response on top of it.
	if ctx.Written() {
		return
	}
	ctx.HTML(200, tplModelManageConvertIndex)
}

// ShowModelConvertPageInfo loads one page of the repository's model-convert
// records into ctx.Data["Tasks"] and the matching pager into
// ctx.Data["Page"].
//
// The "page" and "pageSize" query parameters select the page; non-positive
// values fall back to 1 and setting.UI.IssuePagingNum. When isQueryRight
// rejects the request a NotFound response is written, and a query failure
// produces a server error; in both cases no page data is set.
func ShowModelConvertPageInfo(ctx *context.Context) {
	log.Info("ShowModelConvertInfo start.")
	if !isQueryRight(ctx) {
		log.Info("no right.")
		ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
		return
	}

	page := ctx.QueryInt("page")
	if page <= 0 {
		page = 1
	}
	pageSize := ctx.QueryInt("pageSize")
	if pageSize <= 0 {
		pageSize = setting.UI.IssuePagingNum
	}

	modelResult, count, err := models.QueryModelConvert(&models.AiModelQueryOptions{
		ListOptions: models.ListOptions{
			Page:     page,
			PageSize: pageSize,
		},
		RepoID: ctx.Repo.Repository.ID,
	})
	if err != nil {
		log.Error("query db error: %v", err)
		ctx.ServerError("Cloudbrain", err)
		return
	}

	// First pass: permission flags and the owner ids to resolve.
	userIds := make([]int64, len(modelResult))
	for i, model := range modelResult {
		model.IsCanOper = isOper(ctx, model.UserId)
		model.IsCanDelete = isCanDelete(ctx, model.UserId)
		userIds[i] = model.UserId
	}

	// Second pass: attach the owner's name and avatar where the user was found.
	userNameMap := queryUserName(userIds)
	for _, model := range modelResult {
		if owner := userNameMap[model.UserId]; owner != nil {
			model.UserName = owner.Name
			model.UserRelAvatarLink = owner.RelAvatarLink()
		}
	}

	ctx.Data["Page"] = context.NewPagination(int(count), page, pageSize, 5)
	ctx.Data["Tasks"] = modelResult
}