|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294 |
- package repo
-
- import (
- "encoding/json"
- "errors"
-
- "code.gitea.io/gitea/models"
- "code.gitea.io/gitea/modules/cloudbrain"
- "code.gitea.io/gitea/modules/context"
- "code.gitea.io/gitea/modules/log"
- "code.gitea.io/gitea/modules/setting"
- uuid "github.com/satori/go.uuid"
- )
-
// Templates rendered by the model-conversion pages.
const (
	tplModelManageConvertIndex = "repo/modelmanage/convertIndex"
	tplModelConvertInfo        = "repo/modelmanage/convertshowinfo"
)

// Source-engine identifiers, compared against AiModelConvert.SrcEngine.
const (
	PYTORCH_ENGINE    = iota // 0
	TENSORFLOW_ENGINE        // 1
	MINDSPORE_ENGIN          // 2
)

// Mount points, file names and task naming used when assembling a
// conversion job.
const (
	ModelMountPath    = "/model"   // mount path of the model volume
	CodeMountPath     = "/code"    // mount path of the code volume
	DataSetMountPath  = "/dataset" // mount path of the read-only data volume
	LogFile           = "log.txt"  // suffix of the file the command output is redirected to
	DefaultBranchName = "master"   // branch handed to downloadCode
	SubTaskName       = "task1"    // name of the single task role in the job
)

// Values exchanged with the cloudbrain job service.
const (
	GpuQueue          = "openidebug" // sent as the job's GpuType
	Success           = "S000"       // result code that marks a successful CreateJob response
	GPU_PYTORCH_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tensorRT_7_zouap"
)
-
- var (
- TrainResourceSpecs *models.ResourceSpecs
- )
-
- func SaveModelConvert(ctx *context.Context) {
- log.Info("save model convert start.")
- if !ctx.Repo.CanWrite(models.UnitTypeModelManage) {
- ctx.JSON(403, ctx.Tr("repo.model_noright"))
- return
- }
- name := ctx.Query("name")
- desc := ctx.Query("desc")
- modelId := ctx.Query("modelId")
- modelPath := ctx.Query("ModelFile")
- SrcEngine := ctx.QueryInt("SrcEngine")
- InputShape := ctx.Query("inputshape")
- InputDataFormat := ctx.Query("inputdataformat")
- DestFormat := ctx.QueryInt("DestFormat")
- NetOutputFormat := ctx.QueryInt("NetOutputFormat")
-
- task, err := models.QueryModelById(modelId)
- if err != nil {
- log.Error("no such model!", err.Error())
- ctx.ServerError("no such model:", err)
- return
- }
-
- uuid := uuid.NewV4()
- id := uuid.String()
- modelConvert := &models.AiModelConvert{
- ID: id,
- Name: name,
- Description: desc,
- Status: string(models.JobWaiting),
- SrcEngine: SrcEngine,
- RepoId: ctx.Repo.Repository.ID,
- ModelName: task.Name,
- ModelVersion: task.Version,
- ModelId: modelId,
- ModelPath: modelPath,
- DestFormat: DestFormat,
- NetOutputFormat: NetOutputFormat,
- InputShape: InputShape,
- InputDataFormat: InputDataFormat,
- UserId: ctx.User.ID,
- }
- models.SaveModelConvert(modelConvert)
- err = createTrainJob(modelConvert, ctx, task.Path)
- if err == nil {
- ctx.JSON(200, map[string]string{
- "result_code": "0",
- })
- } else {
- ctx.JSON(200, map[string]string{
- "result_code": "1",
- })
- }
- }
-
- func createTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, modelRelativePath string) error {
- repo, _ := models.GetRepositoryByID(ctx.Repo.Repository.ID)
- if modelConvert.SrcEngine == PYTORCH_ENGINE {
-
- codePath := setting.JobPath + modelConvert.ID + CodeMountPath
-
- downloadCode(repo, codePath, DefaultBranchName)
-
- uploadCodeToMinio(codePath+"/", modelConvert.ID, CodeMountPath+"/")
- log.Info("minio code path=" + setting.CBCodePathPrefix + modelConvert.ID)
-
- modelPath := setting.JobPath + modelConvert.ID + ModelMountPath + "/"
- mkModelPath(modelPath)
- uploadCodeToMinio(modelPath, modelConvert.ID, ModelMountPath+"/")
- command := getModelConvertCommand(modelConvert.ID, modelConvert.ModelPath)
- log.Info("command=" + command)
- dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath
-
- if TrainResourceSpecs == nil {
- json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
- }
- resourceSpec := TrainResourceSpecs.ResourceSpec[1]
- jobResult, err := cloudbrain.CreateJob(modelConvert.ID, models.CreateJobParams{
- JobName: modelConvert.ID,
- RetryCount: 1,
- GpuType: GpuQueue,
- Image: GPU_PYTORCH_IMAGE,
- TaskRoles: []models.TaskRole{
- {
- Name: SubTaskName,
- TaskNumber: 1,
- MinSucceededTaskCount: 1,
- MinFailedTaskCount: 1,
- CPUNumber: resourceSpec.CpuNum,
- GPUNumber: resourceSpec.GpuNum,
- MemoryMB: resourceSpec.MemMiB,
- ShmMB: resourceSpec.ShareMemMiB,
- Command: command,
- NeedIBDevice: false,
- IsMainRole: false,
- UseNNI: false,
- },
- },
- Volumes: []models.Volume{
- {
- HostPath: models.StHostPath{
- Path: codePath,
- MountPath: CodeMountPath,
- ReadOnly: false,
- },
- },
- {
- HostPath: models.StHostPath{
- Path: dataActualPath,
- MountPath: DataSetMountPath,
- ReadOnly: true,
- },
- },
- {
- HostPath: models.StHostPath{
- Path: modelPath,
- MountPath: ModelMountPath,
- ReadOnly: false,
- },
- },
- },
- })
- if err != nil {
- log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"])
- return err
- }
- if jobResult.Code != Success {
- log.Error("CreateJob(%s) failed:%s", modelConvert.ID, jobResult.Msg, ctx.Data["MsgID"])
- return errors.New(jobResult.Msg)
- }
-
- var jobID = jobResult.Payload["jobId"].(string)
- log.Info("jobId=" + jobID)
- models.UpdateModelConvertCBTI(modelConvert.ID, jobID)
- }
-
- return nil
- }
-
- func getModelConvertCommand(name string, modelFile string) string {
- var command string
- bootFile := "convert_pytorch.py"
- command += "python3 /code/" + bootFile + " --model " + modelFile + " > " + ModelMountPath + "/" + name + "-" + LogFile
- return command
- }
-
- func DeleteModelConvert(ctx *context.Context) {
- log.Info("delete model convert start.")
- id := ctx.Params(":id")
- err := models.DeleteModelConvertById(id)
- if err != nil {
- ctx.JSON(500, err.Error())
- } else {
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelmanage/convert_model")
- }
- }
-
- func StopModelConvert(ctx *context.Context) {
- id := ctx.Params(":id")
- log.Info("stop model convert start.id=" + id)
- }
-
- func ShowModelConvertInfo(ctx *context.Context) {
- ctx.Data["ID"] = ctx.Query("ID")
- ctx.Data["isModelManage"] = true
- ctx.Data["ModelManageAccess"] = ctx.Repo.CanWrite(models.UnitTypeModelManage)
-
- job, err := models.QueryModelConvertById(ctx.Query("ID"))
- if err == nil {
- ctx.Data["task"] = job
- }
- result, err := cloudbrain.GetJob(job.CloudBrainTaskId)
- if err != nil {
- log.Info("error:" + err.Error())
- ctx.Data["error"] = err.Error()
- return
- }
- if result != nil {
- jobRes, _ := models.ConvertToJobResultPayload(result.Payload)
- ctx.Data["result"] = jobRes
- taskRoles := jobRes.TaskRoles
- taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
- ctx.Data["taskRes"] = taskRes
- ctx.Data["ExitDiagnostics"] = taskRes.TaskStatuses[0].ExitDiagnostics
-
- job.Status = jobRes.JobStatus.State
-
- if jobRes.JobStatus.State != string(models.JobWaiting) && jobRes.JobStatus.State != string(models.JobFailed) {
- job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP
- job.ContainerID = taskRes.TaskStatuses[0].ContainerID
- job.Status = taskRes.TaskStatuses[0].State
- }
- if jobRes.JobStatus.State != string(models.JobWaiting) {
- models.ModelComputeAndSetDuration(job, jobRes)
- err = models.UpdateModelConvert(job)
- if err != nil {
- log.Error("UpdateModelConvert failed:", err)
- }
- }
- }
- ctx.HTML(200, tplModelConvertInfo)
- }
-
- func ConvertModelTemplate(ctx *context.Context) {
- ctx.Data["isModelManage"] = true
- ctx.Data["MODEL_COUNT"] = 0
- ctx.Data["ModelManageAccess"] = ctx.Repo.CanWrite(models.UnitTypeModelManage)
- ctx.Data["TRAIN_COUNT"] = 0
- ShowModelConvertPageInfo(ctx)
- ctx.HTML(200, tplModelManageConvertIndex)
- }
-
- func ShowModelConvertPageInfo(ctx *context.Context) {
- log.Info("ShowModelConvertInfo start.")
- if !isQueryRight(ctx) {
- log.Info("no right.")
- ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
- return
- }
- page := ctx.QueryInt("page")
- if page <= 0 {
- page = 1
- }
- pageSize := ctx.QueryInt("pageSize")
- if pageSize <= 0 {
- pageSize = setting.UI.IssuePagingNum
- }
- repoId := ctx.Repo.Repository.ID
- modelResult, count, err := models.QueryModelConvert(&models.AiModelQueryOptions{
- ListOptions: models.ListOptions{
- Page: page,
- PageSize: pageSize,
- },
- RepoID: repoId,
- })
- if err != nil {
- log.Info("query db error." + err.Error())
- ctx.ServerError("Cloudbrain", err)
- return
- }
- userIds := make([]int64, len(modelResult))
- for i, model := range modelResult {
- model.IsCanOper = isOper(ctx, model.UserId)
- model.IsCanDelete = isCanDelete(ctx, model.UserId)
- userIds[i] = model.UserId
- }
- userNameMap := queryUserName(userIds)
- for _, model := range modelResult {
- value := userNameMap[model.UserId]
- if value != nil {
- model.UserName = value.Name
- model.UserRelAvatarLink = value.RelAvatarLink()
- }
- }
- pager := context.NewPagination(int(count), page, pageSize, 5)
- ctx.Data["Page"] = pager
- ctx.Data["Tasks"] = modelResult
-
- }
|