Browse Source

提交代码。

Signed-off-by: zouap <zouap@pcl.ac.cn>
tags/v1.22.7.1
zouap 3 years ago
parent
commit
053920309b
3 changed files with 188 additions and 21 deletions
  1. +6
    -6
      models/cloudbrain.go
  2. +175
    -8
      modules/modelarts/modelarts.go
  3. +7
    -7
      routers/repo/ai_model_convert.go

+ 6
- 6
models/cloudbrain.go View File

@@ -164,12 +164,12 @@ type Cloudbrain struct {
FlavorName string //规格名称
EngineName string //引擎名称
TotalVersionCount int //任务的所有版本数量,包括删除的
LabelName string //标签名称
ModelName string //模型名称
ModelVersion string //模型版本
CkptName string //权重文件名称
ResultUrl string //推理结果的obs路径
UserImageCommand string `xorm:"varchar(1000)"` //描述
LabelName string //标签名称
ModelName string //模型名称
ModelVersion string //模型版本
CkptName string //权重文件名称
ResultUrl string //推理结果的obs路径

User *User `xorm:"-"`
Repo *Repository `xorm:"-"`


+ 175
- 8
modules/modelarts/modelarts.go View File

@@ -19,14 +19,14 @@ import (

const (
//notebook
storageTypeOBS = "obs"
autoStopDuration = 4 * 60 * 60
autoStopDurationMs = 4 * 60 * 60 * 1000
DataSetMountPath = "/home/ma-user/work"
NotebookEnv = "Python3"
NotebookType = "Ascend"
FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
storageTypeOBS = "obs"
autoStopDuration = 4 * 60 * 60
autoStopDurationMs = 4 * 60 * 60 * 1000
MORDELART_USER_IMAGE_ENGINE_ID = 1000000
DataSetMountPath = "/home/ma-user/work"
NotebookEnv = "Python3"
NotebookType = "Ascend"
FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"

//train-job
// ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
@@ -152,6 +152,7 @@ type Engine struct {
Info []struct {
ID int `json:"id"`
Value string `json:"value"`
Url string `json:"url"`
} `json:"engine"`
}

@@ -314,6 +315,78 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
return nil
}

func GenerateTrainJobByUserImage(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
createTime := timeutil.TimeStampNow()
jobResult, err := createTrainJobUserImage(models.CreateUserImageTrainJobParams{
JobName: req.JobName,
Description: req.Description,
Config: models.UserImageConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
UserImageUrl: req.UserImageUrl,
UserCommand: req.UserCommand,
},
})
if err != nil {
log.Error("CreateJob failed: %v", err.Error())
return err
}

jobId := strconv.FormatInt(jobResult.JobID, 10)
err = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: jobId,
JobName: req.JobName,
DisplayJobName: req.DisplayJobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: req.DatasetName,
CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion,
ComputeResource: models.NPUResource,
EngineID: MORDELART_USER_IMAGE_ENGINE_ID,
Image: req.UserImageUrl,
UserImageCommand: req.UserCommand,
TrainUrl: req.TrainUrl,
BranchName: req.BranchName,
Parameters: req.Params,
BootFile: req.BootFile,
DataUrl: req.DataUrl,
LogUrl: req.LogUrl,
FlavorCode: req.FlavorCode,
Description: req.Description,
WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName,
EngineName: req.EngineName,
VersionCount: req.VersionCount,
TotalVersionCount: req.TotalVersionCount,
CreatedUnix: createTime,
UpdatedUnix: createTime,
})

if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
return err
}
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
return nil
}

func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
createTime := timeutil.TimeStampNow()
jobResult, err := createTrainJob(models.CreateTrainJobParams{
@@ -497,6 +570,100 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
return err
}

func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
createTime := timeutil.TimeStampNow()
jobResult, err := createTrainJobUserImage(models.CreateUserImageTrainJobParams{
JobName: req.JobName,
Description: req.Description,
Config: models.UserImageConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
UserImageUrl: req.UserImageUrl,
UserCommand: req.UserCommand,
},
})
if err != nil {
log.Error("CreateJob failed: %v", err.Error())
return err
}

var jobTypes []string
jobTypes = append(jobTypes, string(models.JobTypeTrain))
repo := ctx.Repo.Repository
VersionTaskList, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobTypes: jobTypes,
JobID: strconv.FormatInt(jobResult.JobID, 10),
})
if err != nil {
ctx.ServerError("Cloudbrain", err)
return err
}
//将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount

err = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: strconv.FormatInt(jobResult.JobID, 10),
JobName: req.JobName,
DisplayJobName: req.DisplayJobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: req.DatasetName,
CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion,
PreVersionName: req.PreVersionName,
ComputeResource: models.NPUResource,
EngineID: MORDELART_USER_IMAGE_ENGINE_ID,
Image: req.UserImageUrl,
UserImageCommand: req.UserCommand,
TrainUrl: req.TrainUrl,
BranchName: req.BranchName,
Parameters: req.Params,
BootFile: req.BootFile,
DataUrl: req.DataUrl,
LogUrl: req.LogUrl,
PreVersionId: req.PreVersionId,
FlavorCode: req.FlavorCode,
Description: req.Description,
WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName,
EngineName: req.EngineName,
TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
VersionCount: VersionListCount + 1,
CreatedUnix: createTime,
UpdatedUnix: createTime,
})
if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
return err
}

//将训练任务的上一版本的isLatestVersion设置为"0"
err = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCount, NotLatestVersion, TotalVersionCount)
if err != nil {
ctx.ServerError("Update IsLatestVersion failed", err)
return err
}

return err
}

func TransTrainJobStatus(status int) string {
switch status {
case 0:


+ 7
- 7
routers/repo/ai_model_convert.go View File

@@ -39,12 +39,12 @@ const (
Success = "S000"
GPU_PYTORCH_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tensorRT_7_zouap"
GPU_TENSORFLOW_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tf2onnx"
PytorchOnnxBootFile = "convert_pytorch.py"
PytorchTrTBootFile = "convert_pytorch_tensorrt.py"
MindsporeBootFile = "convert_mindspore.py"
TensorFlowNpuBootFile = "convert_tensorflow.py"
TensorFlowGpuBootFile = "convert_tensorflow_gpu.py"
NPU_MINDSPORE_16_IMAGE = "swr.cn-south-222.ai.pcl.cn/openi/mindspore1.6.1_train_v1_openi:v3_ascend"
PytorchOnnxBootFile = "convert_pytorch.py"
PytorchTrTBootFile = "convert_pytorch_tensorrt.py"
MindsporeBootFile = "convert_mindspore.py"
TensorFlowNpuBootFile = "convert_tensorflow.py"
TensorFlowGpuBootFile = "convert_tensorflow_gpu.py"

ConvertRepoPath = "https://git.openi.org.cn/zouap/npu_test"
REPO_ID = 33267
@@ -227,7 +227,7 @@ func createNpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context
PoolID: NPU_PoolID,
//Parameters: param,
BranchName: DefaultBranchName,
UserImageUrl: "swr.cn-south-222.ai.pcl.cn/openi/mindspore1.6.1_train_v1_openi:v3_ascend",
UserImageUrl: NPU_MINDSPORE_16_IMAGE,
UserCommand: userCommand,
}
result, err := modelarts.GenerateModelConvertTrainJob(req)


Loading…
Cancel
Save