You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

aisafety.go 27 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "io"
  7. "io/ioutil"
  8. "net/http"
  9. "os"
  10. "strconv"
  11. "strings"
  12. "time"
  13. "code.gitea.io/gitea/models"
  14. "code.gitea.io/gitea/modules/aisafety"
  15. "code.gitea.io/gitea/modules/cloudbrain"
  16. "code.gitea.io/gitea/modules/context"
  17. "code.gitea.io/gitea/modules/git"
  18. "code.gitea.io/gitea/modules/log"
  19. "code.gitea.io/gitea/modules/modelarts"
  20. "code.gitea.io/gitea/modules/setting"
  21. "code.gitea.io/gitea/modules/storage"
  22. "code.gitea.io/gitea/modules/util"
  23. "code.gitea.io/gitea/services/cloudbrain/resource"
  24. "code.gitea.io/gitea/services/reward/point/account"
  25. uuid "github.com/satori/go.uuid"
  26. )
  // tplModelSafetyTestCreateGpu is the template rendered by AiSafetyCreateForGetGPU.
  const tplModelSafetyTestCreateGpu = "repo/modelsafety/newgpu"

  // tplModelSafetyTestCreateNpu is the template rendered by AiSafetyCreateForGetNPU.
  const tplModelSafetyTestCreateNpu = "repo/modelsafety/newnpu"
  31. func CloudBrainAiSafetyCreateTest(ctx *context.Context) {
  32. log.Info("start to create CloudBrainAiSafetyCreate")
  33. uuid := uuid.NewV4()
  34. id := uuid.String()
  35. seriaNoParas := ctx.Query("serialNo")
  36. fileName := ctx.Query("fileName")
  37. //if jobType == string(models.JobTypeBenchmark) {
  38. req := aisafety.TaskReq{
  39. UnionId: id,
  40. EvalName: "test1",
  41. EvalContent: "test1",
  42. TLPath: "test1",
  43. Indicators: []string{"ACC", "ASS"},
  44. CDName: "CIFAR10_1000_FGSM",
  45. BDName: "CIFAR10_1000基础数据集",
  46. }
  47. aisafety.GetAlgorithmList()
  48. if seriaNoParas != "" {
  49. aisafety.GetTaskStatus(seriaNoParas)
  50. } else {
  51. jsonStr, err := getJsonContent("http://192.168.207.34:8065/Test_zap1234/openi_aisafety/raw/branch/master/result/" + fileName)
  52. serialNo, err := aisafety.CreateSafetyTask(req, jsonStr)
  53. if err == nil {
  54. log.Info("serialNo=" + serialNo)
  55. time.Sleep(time.Duration(2) * time.Second)
  56. aisafety.GetTaskStatus(serialNo)
  57. } else {
  58. log.Info("CreateSafetyTask error," + err.Error())
  59. }
  60. }
  61. }
  62. func GetAiSafetyTask(ctx *context.Context) {
  63. var ID = ctx.Params(":jobid")
  64. job, err := models.GetCloudbrainByJobIDWithDeleted(ID)
  65. if err != nil {
  66. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  67. return
  68. }
  69. syncAiSafetyTaskStatus(job)
  70. }
  71. func syncAiSafetyTaskStatus(job *models.Cloudbrain) {
  72. if isTaskNotFinished(job.Status) {
  73. if job.Type == models.TypeCloudBrainTwo {
  74. queryTaskStatusFromCloudbrainTwo(job)
  75. } else if job.Type == models.TypeCloudBrainOne {
  76. queryTaskStatusFromCloudbrain(job)
  77. }
  78. } else {
  79. if job.Status == string(models.ModelSafetyTesting) {
  80. queryTaskStatusFromModelSafetyTestServer(job)
  81. } else {
  82. log.Info("The job is finished. status=" + job.Status)
  83. }
  84. }
  85. }
  86. func TimerHandleModelSafetyTestTask() {
  87. tasks, err := models.GetModelSafetyTestTask()
  88. if err == nil {
  89. if tasks != nil && len(tasks) > 0 {
  90. for _, job := range tasks {
  91. syncAiSafetyTaskStatus(job)
  92. }
  93. } else {
  94. log.Info("query running model safety test task 0.")
  95. }
  96. } else {
  97. log.Info("query running model safety test task err." + err.Error())
  98. }
  99. }
  100. func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) {
  101. log.Info("The task not finished,name=" + job.DisplayJobName)
  102. result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10))
  103. if err != nil {
  104. log.Info("query train job error." + err.Error())
  105. return
  106. }
  107. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  108. job.Duration = result.Duration
  109. job.TrainJobDuration = result.TrainJobDuration
  110. if job.Status != string(models.ModelArtsTrainJobCompleted) {
  111. err = models.UpdateJob(job)
  112. if err != nil {
  113. log.Error("UpdateJob failed:", err)
  114. }
  115. } else {
  116. job.Status = string(models.ModelSafetyTesting)
  117. err = models.UpdateJob(job)
  118. if err != nil {
  119. log.Error("UpdateJob failed:", err)
  120. }
  121. //send msg to beihang
  122. sendNPUInferenceResultToTest(job)
  123. }
  124. }
  125. func sendNPUInferenceResultToTest(job *models.Cloudbrain) {
  126. datasetname := job.DatasetName
  127. datasetnames := strings.Split(datasetname, ";")
  128. indicator := job.LabelName
  129. req := aisafety.TaskReq{
  130. UnionId: job.JobID,
  131. EvalName: job.DisplayJobName,
  132. EvalContent: job.Description,
  133. TLPath: "test",
  134. Indicators: strings.Split(indicator, ";"),
  135. CDName: datasetnames[1],
  136. BDName: datasetnames[0],
  137. }
  138. jsonContent := ""
  139. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  140. resultPath := modelarts.JobPath + job.JobName + modelarts.ResultPath + VersionOutputPath + "/result.json"
  141. body, err := storage.ObsDownloadAFile(setting.Bucket, resultPath)
  142. if err != nil {
  143. log.Info("ObsDownloadAFile error." + err.Error() + " resultPath=" + resultPath)
  144. } else {
  145. defer body.Close()
  146. var data []byte
  147. p := make([]byte, 4096)
  148. var readErr error
  149. var readCount int
  150. for {
  151. readCount, readErr = body.Read(p)
  152. if readCount > 0 {
  153. data = append(data, p[:readCount]...)
  154. }
  155. if readErr != nil || readCount == 0 {
  156. break
  157. }
  158. }
  159. jsonContent = string(data)
  160. }
  161. if jsonContent != "" {
  162. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  163. if err == nil {
  164. //update serial no to db
  165. job.PreVersionName = serialNo
  166. err = models.UpdateJob(job)
  167. if err != nil {
  168. log.Error("UpdateJob failed:", err)
  169. }
  170. }
  171. } else {
  172. log.Info("The json is null. so set it failed.")
  173. //update task failed.
  174. job.Status = string(models.ModelArtsTrainJobFailed)
  175. err := models.UpdateJob(job)
  176. if err != nil {
  177. log.Error("UpdateJob failed:", err)
  178. }
  179. }
  180. }
  181. func queryTaskStatusFromCloudbrain(job *models.Cloudbrain) {
  182. log.Info("The task not finished,name=" + job.DisplayJobName)
  183. jobResult, err := cloudbrain.GetJob(job.JobID)
  184. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  185. if err != nil {
  186. log.Error("ConvertToJobResultPayload failed:", err)
  187. return
  188. }
  189. job.Status = result.JobStatus.State
  190. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  191. taskRoles := result.TaskRoles
  192. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  193. job.Status = taskRes.TaskStatuses[0].State
  194. }
  195. if result.JobStatus.State != string(models.JobSucceeded) {
  196. err = models.UpdateJob(job)
  197. if err != nil {
  198. log.Error("UpdateJob failed:", err)
  199. }
  200. } else {
  201. //
  202. job.Status = string(models.ModelSafetyTesting)
  203. err = models.UpdateJob(job)
  204. if err != nil {
  205. log.Error("UpdateJob failed:", err)
  206. }
  207. //send msg to beihang
  208. sendGPUInferenceResultToTest(job)
  209. }
  210. }
  211. func queryTaskStatusFromModelSafetyTestServer(job *models.Cloudbrain) {
  212. result, err := aisafety.GetTaskStatus(job.PreVersionName)
  213. if err == nil {
  214. if result.Code == "0" {
  215. if result.Data.Status == 1 {
  216. log.Info("The task is running....")
  217. } else {
  218. if result.Data.Code == 0 {
  219. job.ResultJson = result.Data.StandardJson
  220. err = models.UpdateJob(job)
  221. if err != nil {
  222. log.Error("UpdateJob failed:", err)
  223. }
  224. }
  225. }
  226. } else {
  227. log.Info("The task is failed.")
  228. job.Status = string(models.JobFailed)
  229. err = models.UpdateJob(job)
  230. if err != nil {
  231. log.Error("UpdateJob failed:", err)
  232. }
  233. }
  234. } else {
  235. log.Info("The task not found.....")
  236. }
  237. }
  238. func sendGPUInferenceResultToTest(job *models.Cloudbrain) {
  239. datasetname := job.DatasetName
  240. datasetnames := strings.Split(datasetname, ";")
  241. indicator := job.LabelName
  242. req := aisafety.TaskReq{
  243. UnionId: job.JobID,
  244. EvalName: job.DisplayJobName,
  245. EvalContent: job.Description,
  246. TLPath: "test",
  247. Indicators: strings.Split(indicator, ";"),
  248. CDName: datasetnames[1],
  249. BDName: datasetnames[0],
  250. }
  251. resultDir := "/model"
  252. prefix := "/" + setting.CBCodePathPrefix + job.JobName + resultDir
  253. files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "")
  254. if err != nil {
  255. log.Error("query cloudbrain one model failed: %v", err)
  256. return
  257. }
  258. jsonContent := ""
  259. for _, file := range files {
  260. if strings.HasSuffix(file.FileName, "result.json") {
  261. path := storage.GetMinioPath(job.JobName+resultDir+"/", file.FileName)
  262. log.Info("path=" + path)
  263. reader, err := os.Open(path)
  264. defer reader.Close()
  265. if err == nil {
  266. r := bufio.NewReader(reader)
  267. for {
  268. line, error := r.ReadString('\n')
  269. if error == io.EOF {
  270. log.Info("read file completed.")
  271. break
  272. }
  273. if error != nil {
  274. log.Info("read file error." + error.Error())
  275. break
  276. }
  277. jsonContent += line
  278. }
  279. }
  280. break
  281. }
  282. }
  283. if jsonContent != "" {
  284. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  285. if err == nil {
  286. //update serial no to db
  287. job.PreVersionName = serialNo
  288. err = models.UpdateJob(job)
  289. if err != nil {
  290. log.Error("UpdateJob failed:", err)
  291. }
  292. }
  293. } else {
  294. log.Info("The json is null. so set it failed.")
  295. //update task failed.
  296. job.Status = string(models.JobFailed)
  297. err = models.UpdateJob(job)
  298. if err != nil {
  299. log.Error("UpdateJob failed:", err)
  300. }
  301. }
  302. }
  303. func isTaskNotFinished(status string) bool {
  304. if status == string(models.ModelArtsTrainJobRunning) || status == string(models.ModelArtsTrainJobWaiting) {
  305. return true
  306. }
  307. if status == string(models.JobWaiting) || status == string(models.JobRunning) {
  308. return true
  309. }
  310. if status == string(models.ModelArtsTrainJobUnknown) || status == string(models.ModelArtsTrainJobInit) {
  311. return true
  312. }
  313. if status == string(models.ModelArtsTrainJobImageCreating) || status == string(models.ModelArtsTrainJobSubmitTrying) {
  314. return true
  315. }
  316. return false
  317. }
  318. func StopAiSafetyTask(ctx *context.Context) {
  319. }
  320. func DelAiSafetyTask(ctx *context.Context) {
  321. }
  322. func AiSafetyCreateForGetGPU(ctx *context.Context) {
  323. t := time.Now()
  324. ctx.Data["PageIsCloudBrain"] = true
  325. ctx.Data["IsCreate"] = true
  326. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  327. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  328. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  329. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  330. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  331. ctx.Data["display_job_name"] = displayJobName
  332. prepareCloudbrainOneSpecs(ctx)
  333. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  334. if queuesDetail != nil {
  335. ctx.Data["QueuesDetail"] = queuesDetail
  336. }
  337. ctx.HTML(200, tplModelSafetyTestCreateGpu)
  338. }
  339. func AiSafetyCreateForGetNPU(ctx *context.Context) {
  340. t := time.Now()
  341. ctx.Data["PageIsCloudBrain"] = true
  342. ctx.Data["IsCreate"] = true
  343. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  344. ctx.Data["display_job_name"] = displayJobName
  345. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  346. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  347. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  348. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  349. var resourcePools modelarts.ResourcePool
  350. if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  351. ctx.ServerError("json.Unmarshal failed:", err)
  352. }
  353. ctx.Data["resource_pools"] = resourcePools.Info
  354. var engines modelarts.Engine
  355. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  356. ctx.ServerError("json.Unmarshal failed:", err)
  357. }
  358. ctx.Data["engines"] = engines.Info
  359. var versionInfos modelarts.VersionInfo
  360. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  361. ctx.ServerError("json.Unmarshal failed:", err)
  362. }
  363. ctx.Data["engine_versions"] = versionInfos.Version
  364. prepareCloudbrainTwoInferenceSpecs(ctx)
  365. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  366. ctx.Data["WaitCount"] = waitCount
  367. ctx.HTML(200, tplModelSafetyTestCreateNpu)
  368. }
  369. func AiSafetyCreateForPost(ctx *context.Context) {
  370. ctx.Data["PageIsCloudBrain"] = true
  371. displayJobName := ctx.Query("DisplayJobName")
  372. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  373. taskType := ctx.QueryInt("type")
  374. description := ctx.Query("Description")
  375. ctx.Data["description"] = description
  376. repo := ctx.Repo.Repository
  377. tpname := tplCloudBrainModelSafetyNewNpu
  378. if taskType == models.TypeCloudBrainOne {
  379. tpname = tplCloudBrainModelSafetyNewGpu
  380. }
  381. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeModelSafety), displayJobName)
  382. if err == nil {
  383. if len(tasks) != 0 {
  384. log.Error("the job name did already exist", ctx.Data["MsgID"])
  385. modelSafetyNewDataPrepare(ctx)
  386. ctx.RenderWithErr("the job name did already exist", tpname, nil)
  387. return
  388. }
  389. } else {
  390. if !models.IsErrJobNotExist(err) {
  391. log.Error("system error, %v", err, ctx.Data["MsgID"])
  392. modelSafetyNewDataPrepare(ctx)
  393. ctx.RenderWithErr("system error", tpname, nil)
  394. return
  395. }
  396. }
  397. if !jobNamePattern.MatchString(jobName) {
  398. modelSafetyNewDataPrepare(ctx)
  399. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpname, nil)
  400. return
  401. }
  402. count, err := models.GetModelSafetyCountByUserID(ctx.User.ID)
  403. if err != nil {
  404. log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
  405. modelSafetyNewDataPrepare(ctx)
  406. ctx.RenderWithErr("system error", tpname, nil)
  407. return
  408. } else {
  409. if count >= 1 {
  410. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  411. modelSafetyNewDataPrepare(ctx)
  412. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpname, nil)
  413. return
  414. }
  415. }
  416. BootFile := ctx.Query("BootFile")
  417. bootFileExist, err := ctx.Repo.FileExists(BootFile, cloudbrain.DefaultBranchName)
  418. if err != nil || !bootFileExist {
  419. log.Error("Get bootfile error:", err)
  420. modelSafetyNewDataPrepare(ctx)
  421. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpname, nil)
  422. return
  423. }
  424. if taskType == models.TypeCloudBrainTwo {
  425. createForNPU(ctx, jobName)
  426. } else if taskType == models.TypeCloudBrainOne {
  427. createForGPU(ctx, jobName)
  428. }
  429. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  430. }
  431. func createForNPU(ctx *context.Context, jobName string) {
  432. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  433. BootFile := ctx.Query("BootFile")
  434. displayJobName := ctx.Query("DisplayJobName")
  435. description := ctx.Query("Description")
  436. engineID := ctx.QueryInt("EngineID")
  437. poolID := ctx.Query("PoolID")
  438. //image := strings.TrimSpace(ctx.Query("Image"))
  439. srcDataset := ctx.Query("srcDataset") //uuid
  440. combatDataset := ctx.Query("combatDataset") //uuid
  441. evaluationIndex := ctx.Query("evaluationIndex")
  442. Params := ctx.Query("RunParaList")
  443. specId := ctx.QueryInt64("SpecId")
  444. repo := ctx.Repo.Repository
  445. trainUrl := ctx.Query("TrainUrl")
  446. modelName := ctx.Query("ModelName")
  447. modelVersion := ctx.Query("ModelVersion")
  448. ckptName := ctx.Query("CkptName")
  449. ckptUrl := "/" + trainUrl + ckptName
  450. log.Info("ckpt url:" + ckptUrl)
  451. FlavorName := ctx.Query("FlavorName")
  452. EngineName := ctx.Query("EngineName")
  453. isLatestVersion := modelarts.IsLatestVersion
  454. VersionCount := modelarts.VersionCountOne
  455. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  456. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  457. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  458. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  459. log.Info("ckpt url:" + ckptUrl)
  460. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  461. JobType: models.JobTypeInference,
  462. ComputeResource: models.NPU,
  463. Cluster: models.OpenICluster,
  464. AiCenterCode: models.AICenterOfCloudBrainTwo})
  465. if err != nil || spec == nil {
  466. modelSafetyNewDataPrepare(ctx)
  467. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewNpu, nil)
  468. return
  469. }
  470. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  471. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  472. modelSafetyNewDataPrepare(ctx)
  473. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewNpu, nil)
  474. return
  475. }
  476. //todo: del the codeLocalPath
  477. _, err = ioutil.ReadDir(codeLocalPath)
  478. if err == nil {
  479. os.RemoveAll(codeLocalPath)
  480. }
  481. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  482. commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  483. if err := downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  484. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  485. modelSafetyNewDataPrepare(ctx)
  486. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  487. return
  488. }
  489. //todo: upload code (send to file_server todo this work?)
  490. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  491. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  492. modelSafetyNewDataPrepare(ctx)
  493. ctx.RenderWithErr("Failed to obsMkdir_result", tplCloudBrainModelSafetyNewNpu, nil)
  494. return
  495. }
  496. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  497. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  498. modelSafetyNewDataPrepare(ctx)
  499. ctx.RenderWithErr("Failed to obsMkdir_log", tplCloudBrainModelSafetyNewNpu, nil)
  500. return
  501. }
  502. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  503. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  504. modelSafetyNewDataPrepare(ctx)
  505. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  506. return
  507. }
  508. var parameters models.Parameters
  509. param := make([]models.Parameter, 0)
  510. param = append(param, models.Parameter{
  511. Label: modelarts.ResultUrl,
  512. Value: "s3:/" + resultObsPath,
  513. }, models.Parameter{
  514. Label: modelarts.CkptUrl,
  515. Value: "s3:/" + ckptUrl,
  516. })
  517. uuid := srcDataset + ";" + combatDataset
  518. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  519. if err != nil {
  520. modelSafetyNewDataPrepare(ctx)
  521. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  522. return
  523. }
  524. dataPath := dataUrl
  525. jsondatas, err := json.Marshal(datasUrlList)
  526. if err != nil {
  527. log.Error("Failed to Marshal: %v", err)
  528. modelSafetyNewDataPrepare(ctx)
  529. ctx.RenderWithErr("json error:"+err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  530. return
  531. }
  532. if isMultiDataset {
  533. param = append(param, models.Parameter{
  534. Label: modelarts.MultiDataUrl,
  535. Value: string(jsondatas),
  536. })
  537. }
  538. existDeviceTarget := false
  539. if len(Params) != 0 {
  540. err := json.Unmarshal([]byte(Params), &parameters)
  541. if err != nil {
  542. log.Error("Failed to Unmarshal params: %s (%v)", Params, err)
  543. modelSafetyNewDataPrepare(ctx)
  544. ctx.RenderWithErr("运行参数错误", tplCloudBrainModelSafetyNewNpu, nil)
  545. return
  546. }
  547. for _, parameter := range parameters.Parameter {
  548. if parameter.Label == modelarts.DeviceTarget {
  549. existDeviceTarget = true
  550. }
  551. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  552. param = append(param, models.Parameter{
  553. Label: parameter.Label,
  554. Value: parameter.Value,
  555. })
  556. }
  557. }
  558. }
  559. if !existDeviceTarget {
  560. param = append(param, models.Parameter{
  561. Label: modelarts.DeviceTarget,
  562. Value: modelarts.Ascend,
  563. })
  564. }
  565. req := &modelarts.GenerateInferenceJobReq{
  566. JobName: jobName,
  567. DisplayJobName: displayJobName,
  568. DataUrl: dataPath,
  569. Description: description,
  570. CodeObsPath: codeObsPath,
  571. BootFileUrl: codeObsPath + BootFile,
  572. BootFile: BootFile,
  573. TrainUrl: trainUrl,
  574. WorkServerNumber: 1,
  575. EngineID: int64(engineID),
  576. LogUrl: logObsPath,
  577. PoolID: poolID,
  578. Uuid: uuid,
  579. Parameters: param, //modelarts train parameters
  580. CommitID: commitID,
  581. BranchName: cloudbrain.DefaultBranchName,
  582. Params: Params,
  583. FlavorName: FlavorName,
  584. EngineName: EngineName,
  585. LabelName: evaluationIndex,
  586. IsLatestVersion: isLatestVersion,
  587. VersionCount: VersionCount,
  588. TotalVersionCount: modelarts.TotalVersionCount,
  589. ModelName: modelName,
  590. ModelVersion: modelVersion,
  591. CkptName: ckptName,
  592. ResultUrl: resultObsPath,
  593. Spec: spec,
  594. DatasetName: datasetNames,
  595. JobType: string(models.JobTypeModelSafety),
  596. }
  597. err = modelarts.GenerateInferenceJob(ctx, req)
  598. if err != nil {
  599. log.Error("GenerateTrainJob failed:%v", err.Error())
  600. modelSafetyNewDataPrepare(ctx)
  601. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  602. return
  603. }
  604. }
  605. func createForGPU(ctx *context.Context, jobName string) {
  606. BootFile := ctx.Query("BootFile")
  607. displayJobName := ctx.Query("DisplayJobName")
  608. description := ctx.Query("Description")
  609. image := strings.TrimSpace(ctx.Query("Image"))
  610. srcDataset := ctx.Query("srcDataset") //uuid
  611. combatDataset := ctx.Query("combatDataset") //uuid
  612. evaluationIndex := ctx.Query("evaluationIndex")
  613. Params := ctx.Query("RunParaList")
  614. specId := ctx.QueryInt64("SpecId")
  615. TrainUrl := ctx.Query("TrainUrl")
  616. CkptName := ctx.Query("CkptName")
  617. ckptUrl := setting.Attachment.Minio.RealPath + TrainUrl + CkptName
  618. log.Info("ckpt url:" + ckptUrl)
  619. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  620. JobType: models.JobTypeBenchmark,
  621. ComputeResource: models.GPU,
  622. Cluster: models.OpenICluster,
  623. AiCenterCode: models.AICenterOfCloudBrainOne})
  624. if err != nil || spec == nil {
  625. modelSafetyNewDataPrepare(ctx)
  626. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGpu, nil)
  627. return
  628. }
  629. repo := ctx.Repo.Repository
  630. codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
  631. os.RemoveAll(codePath)
  632. if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil {
  633. log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"])
  634. modelSafetyNewDataPrepare(ctx)
  635. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  636. return
  637. }
  638. err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
  639. if err != nil {
  640. log.Error("uploadCodeToMinio failed, %v", err, ctx.Data["MsgID"])
  641. modelSafetyNewDataPrepare(ctx)
  642. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  643. return
  644. }
  645. uuid := srcDataset + ";" + combatDataset
  646. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
  647. if err != nil {
  648. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  649. modelSafetyNewDataPrepare(ctx)
  650. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil)
  651. return
  652. }
  653. command, err := getGpuModelSafetyCommand(BootFile, Params, CkptName, displayJobName)
  654. if err != nil {
  655. log.Error("Get Command failed: %v", err, ctx.Data["MsgID"])
  656. modelSafetyNewDataPrepare(ctx)
  657. //ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil) TODO
  658. return
  659. }
  660. log.Info("Command=" + command)
  661. req := cloudbrain.GenerateCloudBrainTaskReq{
  662. Ctx: ctx,
  663. DisplayJobName: displayJobName,
  664. JobName: jobName,
  665. Image: image,
  666. Command: command,
  667. Uuids: uuid,
  668. DatasetNames: datasetNames,
  669. DatasetInfos: datasetInfos,
  670. CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
  671. ModelPath: setting.Attachment.Minio.RealPath + TrainUrl,
  672. BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
  673. Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
  674. BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
  675. JobType: string(models.JobTypeModelSafety),
  676. Description: description,
  677. BranchName: cloudbrain.DefaultBranchName,
  678. BootFile: BootFile,
  679. Params: Params,
  680. CommitID: "",
  681. ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
  682. Spec: spec,
  683. LabelName: evaluationIndex,
  684. }
  685. err = cloudbrain.GenerateTask(req)
  686. if err != nil {
  687. modelSafetyNewDataPrepare(ctx)
  688. ctx.RenderWithErr(err.Error(), tplCloudBrainBenchmarkNew, nil)
  689. return
  690. }
  691. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/modelsafety_test")
  692. }
  693. func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, DisplayJobName string) (string, error) {
  694. var command string
  695. bootFile := strings.TrimSpace(BootFile)
  696. if !strings.HasSuffix(bootFile, ".py") {
  697. log.Error("bootFile(%s) format error", bootFile)
  698. return command, errors.New("bootFile format error")
  699. }
  700. var parameters models.Parameters
  701. var param string
  702. if len(params) != 0 {
  703. err := json.Unmarshal([]byte(params), &parameters)
  704. if err != nil {
  705. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  706. return command, err
  707. }
  708. for _, parameter := range parameters.Parameter {
  709. param += " --" + parameter.Label + "=" + parameter.Value
  710. }
  711. }
  712. param += " --modelname" + "=" + CkptName
  713. command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + DisplayJobName + "-" + cloudbrain.LogFile
  714. return command, nil
  715. }
  716. func modelSafetyNewDataPrepare(ctx *context.Context) error {
  717. ctx.Data["PageIsCloudBrain"] = true
  718. t := time.Now()
  719. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  720. ctx.Data["display_job_name"] = displayJobName
  721. ctx.Data["command"] = cloudbrain.GetCloudbrainDebugCommand()
  722. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  723. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  724. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  725. ctx.Data["benchmark_path"] = cloudbrain.BenchMarkMountPath
  726. ctx.Data["is_benchmark_enabled"] = setting.IsBenchmarkEnabled
  727. if categories == nil {
  728. json.Unmarshal([]byte(setting.BenchmarkCategory), &categories)
  729. }
  730. ctx.Data["benchmark_categories"] = categories.Category
  731. ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType
  732. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  733. if queuesDetail != nil {
  734. ctx.Data["QueuesDetail"] = queuesDetail
  735. }
  736. prepareCloudbrainOneSpecs(ctx)
  737. ctx.Data["params"] = ""
  738. ctx.Data["branchName"] = ctx.Repo.BranchName
  739. ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath
  740. ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled
  741. ctx.Data["brainscore_path"] = cloudbrain.BrainScoreMountPath
  742. ctx.Data["is_brainscore_enabled"] = setting.IsBrainScoreEnabled
  743. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  744. ctx.Data["benchmarkMode"] = ctx.Query("benchmarkMode")
  745. return nil
  746. }
  // getJsonContent fetches url with an HTTP GET and returns the response body
  // as a string.
  //
  // It returns an error when the request fails, when the response status is not
  // 200 OK, or when the body cannot be read. The request is bounded by a
  // 30-second timeout.
  func getJsonContent(url string) (string, error) {
  	client := http.Client{Timeout: 30 * time.Second}
  	resp, err := client.Get(url)
  	if err != nil {
  		log.Info("Get organizations url error=" + err.Error())
  		return "", err
  	}
  	defer resp.Body.Close()

  	if resp.StatusCode != http.StatusOK {
  		// The original called err.Error() on a nil error here, which
  		// panicked on every non-200 response.
  		err = errors.New("unexpected HTTP status " + resp.Status)
  		log.Info("Get organizations url error=" + err.Error())
  		return "", err
  	}

  	content, err := ioutil.ReadAll(resp.Body)
  	if err != nil {
  		log.Info("Get organizations url error=" + err.Error())
  		return "", err
  	}
  	return string(content), nil
  }