You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

aisafety.go 28 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "io"
  7. "io/ioutil"
  8. "net/http"
  9. "os"
  10. "strconv"
  11. "strings"
  12. "time"
  13. "code.gitea.io/gitea/models"
  14. "code.gitea.io/gitea/modules/aisafety"
  15. "code.gitea.io/gitea/modules/cloudbrain"
  16. "code.gitea.io/gitea/modules/context"
  17. "code.gitea.io/gitea/modules/git"
  18. "code.gitea.io/gitea/modules/log"
  19. "code.gitea.io/gitea/modules/modelarts"
  20. "code.gitea.io/gitea/modules/setting"
  21. "code.gitea.io/gitea/modules/storage"
  22. "code.gitea.io/gitea/modules/util"
  23. "code.gitea.io/gitea/services/cloudbrain/resource"
  24. "code.gitea.io/gitea/services/reward/point/account"
  25. uuid "github.com/satori/go.uuid"
  26. )
  // tplModelSafetyTestCreateGpu is the template rendered by AiSafetyCreateForGetGPU.
  const tplModelSafetyTestCreateGpu = "repo/modelsafety/newgpu"

  // tplModelSafetyTestCreateNpu is the template rendered by AiSafetyCreateForGetNPU.
  const tplModelSafetyTestCreateNpu = "repo/modelsafety/newnpu"
  31. func CloudBrainAiSafetyCreateTest(ctx *context.Context) {
  32. log.Info("start to create CloudBrainAiSafetyCreate")
  33. uuid := uuid.NewV4()
  34. id := uuid.String()
  35. seriaNoParas := ctx.Query("serialNo")
  36. fileName := ctx.Query("fileName")
  37. //if jobType == string(models.JobTypeBenchmark) {
  38. req := aisafety.TaskReq{
  39. UnionId: id,
  40. EvalName: "test1",
  41. EvalContent: "test1",
  42. TLPath: "test1",
  43. Indicators: []string{"ACC", "ASS"},
  44. CDName: "CIFAR10_1000_FGSM",
  45. BDName: "CIFAR10_1000基础数据集",
  46. }
  47. aisafety.GetAlgorithmList()
  48. if seriaNoParas != "" {
  49. aisafety.GetTaskStatus(seriaNoParas)
  50. } else {
  51. jsonStr, err := getJsonContent("http://192.168.207.34:8065/Test_zap1234/openi_aisafety/raw/branch/master/result/" + fileName)
  52. serialNo, err := aisafety.CreateSafetyTask(req, jsonStr)
  53. if err == nil {
  54. log.Info("serialNo=" + serialNo)
  55. time.Sleep(time.Duration(2) * time.Second)
  56. aisafety.GetTaskStatus(serialNo)
  57. } else {
  58. log.Info("CreateSafetyTask error," + err.Error())
  59. }
  60. }
  61. }
  62. func GetAiSafetyTaskByJob(job *models.Cloudbrain) {
  63. if job == nil {
  64. log.Error("GetCloudbrainByJobID failed")
  65. return
  66. }
  67. syncAiSafetyTaskStatus(job)
  68. }
  69. func GetAiSafetyTask(ctx *context.Context) {
  70. var ID = ctx.Params(":jobid")
  71. job, err := models.GetCloudbrainByJobIDWithDeleted(ID)
  72. if err != nil {
  73. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  74. return
  75. }
  76. syncAiSafetyTaskStatus(job)
  77. }
  78. func syncAiSafetyTaskStatus(job *models.Cloudbrain) {
  79. if isTaskNotFinished(job.Status) {
  80. if job.Type == models.TypeCloudBrainTwo {
  81. queryTaskStatusFromCloudbrainTwo(job)
  82. } else if job.Type == models.TypeCloudBrainOne {
  83. queryTaskStatusFromCloudbrain(job)
  84. }
  85. } else {
  86. if job.Status == string(models.ModelSafetyTesting) {
  87. queryTaskStatusFromModelSafetyTestServer(job)
  88. } else {
  89. log.Info("The job is finished. status=" + job.Status)
  90. }
  91. }
  92. }
  93. func TimerHandleModelSafetyTestTask() {
  94. tasks, err := models.GetModelSafetyTestTask()
  95. if err == nil {
  96. if tasks != nil && len(tasks) > 0 {
  97. for _, job := range tasks {
  98. syncAiSafetyTaskStatus(job)
  99. }
  100. } else {
  101. log.Info("query running model safety test task 0.")
  102. }
  103. } else {
  104. log.Info("query running model safety test task err." + err.Error())
  105. }
  106. }
  107. func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) {
  108. log.Info("The task not finished,name=" + job.DisplayJobName)
  109. result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10))
  110. if err != nil {
  111. log.Info("query train job error." + err.Error())
  112. return
  113. }
  114. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  115. job.Duration = result.Duration
  116. job.TrainJobDuration = result.TrainJobDuration
  117. if job.Status != string(models.ModelArtsTrainJobCompleted) {
  118. err = models.UpdateJob(job)
  119. if err != nil {
  120. log.Error("UpdateJob failed:", err)
  121. }
  122. } else {
  123. job.Status = string(models.ModelSafetyTesting)
  124. err = models.UpdateJob(job)
  125. if err != nil {
  126. log.Error("UpdateJob failed:", err)
  127. }
  128. //send msg to beihang
  129. sendNPUInferenceResultToTest(job)
  130. }
  131. }
  132. func sendNPUInferenceResultToTest(job *models.Cloudbrain) {
  133. datasetname := job.DatasetName
  134. datasetnames := strings.Split(datasetname, ";")
  135. indicator := job.LabelName
  136. req := aisafety.TaskReq{
  137. UnionId: job.JobID,
  138. EvalName: job.DisplayJobName,
  139. EvalContent: job.Description,
  140. TLPath: "test",
  141. Indicators: strings.Split(indicator, ";"),
  142. CDName: datasetnames[1],
  143. BDName: datasetnames[0],
  144. }
  145. jsonContent := ""
  146. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  147. resultPath := modelarts.JobPath + job.JobName + modelarts.ResultPath + VersionOutputPath + "/result.json"
  148. body, err := storage.ObsDownloadAFile(setting.Bucket, resultPath)
  149. if err != nil {
  150. log.Info("ObsDownloadAFile error." + err.Error() + " resultPath=" + resultPath)
  151. } else {
  152. defer body.Close()
  153. var data []byte
  154. p := make([]byte, 4096)
  155. var readErr error
  156. var readCount int
  157. for {
  158. readCount, readErr = body.Read(p)
  159. if readCount > 0 {
  160. data = append(data, p[:readCount]...)
  161. }
  162. if readErr != nil || readCount == 0 {
  163. break
  164. }
  165. }
  166. jsonContent = string(data)
  167. }
  168. if jsonContent != "" {
  169. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  170. if err == nil {
  171. //update serial no to db
  172. job.PreVersionName = serialNo
  173. err = models.UpdateJob(job)
  174. if err != nil {
  175. log.Error("UpdateJob failed:", err)
  176. }
  177. }
  178. } else {
  179. log.Info("The json is null. so set it failed.")
  180. //update task failed.
  181. job.Status = string(models.ModelArtsTrainJobFailed)
  182. err := models.UpdateJob(job)
  183. if err != nil {
  184. log.Error("UpdateJob failed:", err)
  185. }
  186. }
  187. }
  188. func queryTaskStatusFromCloudbrain(job *models.Cloudbrain) {
  189. log.Info("The task not finished,name=" + job.DisplayJobName)
  190. jobResult, err := cloudbrain.GetJob(job.JobID)
  191. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  192. if err != nil {
  193. log.Error("ConvertToJobResultPayload failed:", err)
  194. return
  195. }
  196. job.Status = result.JobStatus.State
  197. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  198. taskRoles := result.TaskRoles
  199. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  200. job.Status = taskRes.TaskStatuses[0].State
  201. }
  202. if result.JobStatus.State != string(models.JobSucceeded) {
  203. err = models.UpdateJob(job)
  204. if err != nil {
  205. log.Error("UpdateJob failed:", err)
  206. }
  207. } else {
  208. //
  209. job.Status = string(models.ModelSafetyTesting)
  210. err = models.UpdateJob(job)
  211. if err != nil {
  212. log.Error("UpdateJob failed:", err)
  213. }
  214. //send msg to beihang
  215. sendGPUInferenceResultToTest(job)
  216. }
  217. }
  218. func queryTaskStatusFromModelSafetyTestServer(job *models.Cloudbrain) {
  219. result, err := aisafety.GetTaskStatus(job.PreVersionName)
  220. if err == nil {
  221. if result.Code == "0" {
  222. if result.Data.Status == 1 {
  223. log.Info("The task is running....")
  224. } else {
  225. if result.Data.Code == 0 {
  226. job.ResultJson = result.Data.StandardJson
  227. err = models.UpdateJob(job)
  228. if err != nil {
  229. log.Error("UpdateJob failed:", err)
  230. }
  231. }
  232. }
  233. } else {
  234. log.Info("The task is failed.")
  235. job.Status = string(models.JobFailed)
  236. err = models.UpdateJob(job)
  237. if err != nil {
  238. log.Error("UpdateJob failed:", err)
  239. }
  240. }
  241. } else {
  242. log.Info("The task not found.....")
  243. }
  244. }
  245. func sendGPUInferenceResultToTest(job *models.Cloudbrain) {
  246. datasetname := job.DatasetName
  247. datasetnames := strings.Split(datasetname, ";")
  248. indicator := job.LabelName
  249. req := aisafety.TaskReq{
  250. UnionId: job.JobID,
  251. EvalName: job.DisplayJobName,
  252. EvalContent: job.Description,
  253. TLPath: "test",
  254. Indicators: strings.Split(indicator, ";"),
  255. CDName: datasetnames[1],
  256. BDName: datasetnames[0],
  257. }
  258. resultDir := "/model"
  259. prefix := "/" + setting.CBCodePathPrefix + job.JobName + resultDir
  260. files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "")
  261. if err != nil {
  262. log.Error("query cloudbrain one model failed: %v", err)
  263. return
  264. }
  265. jsonContent := ""
  266. for _, file := range files {
  267. if strings.HasSuffix(file.FileName, "result.json") {
  268. path := storage.GetMinioPath(job.JobName+resultDir+"/", file.FileName)
  269. log.Info("path=" + path)
  270. reader, err := os.Open(path)
  271. defer reader.Close()
  272. if err == nil {
  273. r := bufio.NewReader(reader)
  274. for {
  275. line, error := r.ReadString('\n')
  276. if error == io.EOF {
  277. log.Info("read file completed.")
  278. break
  279. }
  280. if error != nil {
  281. log.Info("read file error." + error.Error())
  282. break
  283. }
  284. jsonContent += line
  285. }
  286. }
  287. break
  288. }
  289. }
  290. if jsonContent != "" {
  291. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  292. if err == nil {
  293. //update serial no to db
  294. job.PreVersionName = serialNo
  295. err = models.UpdateJob(job)
  296. if err != nil {
  297. log.Error("UpdateJob failed:", err)
  298. }
  299. }
  300. } else {
  301. log.Info("The json is null. so set it failed.")
  302. //update task failed.
  303. job.Status = string(models.JobFailed)
  304. err = models.UpdateJob(job)
  305. if err != nil {
  306. log.Error("UpdateJob failed:", err)
  307. }
  308. }
  309. }
  310. func isTaskNotFinished(status string) bool {
  311. if status == string(models.ModelArtsTrainJobRunning) || status == string(models.ModelArtsTrainJobWaiting) {
  312. return true
  313. }
  314. if status == string(models.JobWaiting) || status == string(models.JobRunning) {
  315. return true
  316. }
  317. if status == string(models.ModelArtsTrainJobUnknown) || status == string(models.ModelArtsTrainJobInit) {
  318. return true
  319. }
  320. if status == string(models.ModelArtsTrainJobImageCreating) || status == string(models.ModelArtsTrainJobSubmitTrying) {
  321. return true
  322. }
  323. return false
  324. }
  325. func StopAiSafetyTask(ctx *context.Context) {
  326. }
  327. func DelAiSafetyTask(ctx *context.Context) {
  328. }
  329. func AiSafetyCreateForGetGPU(ctx *context.Context) {
  330. t := time.Now()
  331. ctx.Data["PageIsCloudBrain"] = true
  332. ctx.Data["IsCreate"] = true
  333. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  334. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  335. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  336. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  337. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  338. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  339. ctx.Data["display_job_name"] = displayJobName
  340. prepareCloudbrainOneSpecs(ctx)
  341. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  342. if queuesDetail != nil {
  343. ctx.Data["QueuesDetail"] = queuesDetail
  344. }
  345. ctx.HTML(200, tplModelSafetyTestCreateGpu)
  346. }
  347. func AiSafetyCreateForGetNPU(ctx *context.Context) {
  348. t := time.Now()
  349. ctx.Data["PageIsCloudBrain"] = true
  350. ctx.Data["IsCreate"] = true
  351. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  352. ctx.Data["display_job_name"] = displayJobName
  353. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  354. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  355. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  356. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  357. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  358. var resourcePools modelarts.ResourcePool
  359. if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  360. ctx.ServerError("json.Unmarshal failed:", err)
  361. }
  362. ctx.Data["resource_pools"] = resourcePools.Info
  363. var engines modelarts.Engine
  364. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  365. ctx.ServerError("json.Unmarshal failed:", err)
  366. }
  367. ctx.Data["engines"] = engines.Info
  368. var versionInfos modelarts.VersionInfo
  369. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  370. ctx.ServerError("json.Unmarshal failed:", err)
  371. }
  372. ctx.Data["engine_versions"] = versionInfos.Version
  373. prepareCloudbrainTwoInferenceSpecs(ctx)
  374. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  375. ctx.Data["WaitCount"] = waitCount
  376. ctx.HTML(200, tplModelSafetyTestCreateNpu)
  377. }
  378. func AiSafetyCreateForPost(ctx *context.Context) {
  379. ctx.Data["PageIsCloudBrain"] = true
  380. displayJobName := ctx.Query("DisplayJobName")
  381. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  382. taskType := ctx.QueryInt("type")
  383. description := ctx.Query("Description")
  384. ctx.Data["description"] = description
  385. repo := ctx.Repo.Repository
  386. tpname := tplCloudBrainModelSafetyNewNpu
  387. if taskType == models.TypeCloudBrainOne {
  388. tpname = tplCloudBrainModelSafetyNewGpu
  389. }
  390. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeModelSafety), displayJobName)
  391. if err == nil {
  392. if len(tasks) != 0 {
  393. log.Error("the job name did already exist", ctx.Data["MsgID"])
  394. modelSafetyNewDataPrepare(ctx)
  395. ctx.RenderWithErr("the job name did already exist", tpname, nil)
  396. return
  397. }
  398. } else {
  399. if !models.IsErrJobNotExist(err) {
  400. log.Error("system error, %v", err, ctx.Data["MsgID"])
  401. modelSafetyNewDataPrepare(ctx)
  402. ctx.RenderWithErr("system error", tpname, nil)
  403. return
  404. }
  405. }
  406. if !jobNamePattern.MatchString(jobName) {
  407. modelSafetyNewDataPrepare(ctx)
  408. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpname, nil)
  409. return
  410. }
  411. count, err := models.GetModelSafetyCountByUserID(ctx.User.ID)
  412. if err != nil {
  413. log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
  414. modelSafetyNewDataPrepare(ctx)
  415. ctx.RenderWithErr("system error", tpname, nil)
  416. return
  417. } else {
  418. if count >= 1 {
  419. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  420. modelSafetyNewDataPrepare(ctx)
  421. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpname, nil)
  422. return
  423. }
  424. }
  425. BootFile := ctx.Query("BootFile")
  426. bootFileExist, err := ctx.Repo.FileExists(BootFile, cloudbrain.DefaultBranchName)
  427. if err != nil || !bootFileExist {
  428. log.Error("Get bootfile error:", err)
  429. modelSafetyNewDataPrepare(ctx)
  430. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpname, nil)
  431. return
  432. }
  433. if taskType == models.TypeCloudBrainTwo {
  434. createForNPU(ctx, jobName)
  435. } else if taskType == models.TypeCloudBrainOne {
  436. createForGPU(ctx, jobName)
  437. }
  438. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  439. }
  440. func createForNPU(ctx *context.Context, jobName string) {
  441. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  442. BootFile := ctx.Query("BootFile")
  443. displayJobName := ctx.Query("DisplayJobName")
  444. description := ctx.Query("Description")
  445. engineID := ctx.QueryInt("EngineID")
  446. poolID := ctx.Query("PoolID")
  447. //image := strings.TrimSpace(ctx.Query("Image"))
  448. srcDataset := ctx.Query("srcDataset") //uuid
  449. combatDataset := ctx.Query("combatDataset") //uuid
  450. evaluationIndex := ctx.Query("evaluationIndex")
  451. Params := ctx.Query("RunParaList")
  452. specId := ctx.QueryInt64("SpecId")
  453. repo := ctx.Repo.Repository
  454. trainUrl := ctx.Query("TrainUrl")
  455. modelName := ctx.Query("ModelName")
  456. modelVersion := ctx.Query("ModelVersion")
  457. ckptName := ctx.Query("CkptName")
  458. ckptUrl := "/" + trainUrl + ckptName
  459. log.Info("ckpt url:" + ckptUrl)
  460. FlavorName := ctx.Query("FlavorName")
  461. EngineName := ctx.Query("EngineName")
  462. isLatestVersion := modelarts.IsLatestVersion
  463. VersionCount := modelarts.VersionCountOne
  464. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  465. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  466. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  467. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  468. log.Info("ckpt url:" + ckptUrl)
  469. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  470. JobType: models.JobTypeInference,
  471. ComputeResource: models.NPU,
  472. Cluster: models.OpenICluster,
  473. AiCenterCode: models.AICenterOfCloudBrainTwo})
  474. if err != nil || spec == nil {
  475. modelSafetyNewDataPrepare(ctx)
  476. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewNpu, nil)
  477. return
  478. }
  479. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  480. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  481. modelSafetyNewDataPrepare(ctx)
  482. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewNpu, nil)
  483. return
  484. }
  485. //todo: del the codeLocalPath
  486. _, err = ioutil.ReadDir(codeLocalPath)
  487. if err == nil {
  488. os.RemoveAll(codeLocalPath)
  489. }
  490. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  491. commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  492. if err := downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  493. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  494. modelSafetyNewDataPrepare(ctx)
  495. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  496. return
  497. }
  498. //todo: upload code (send to file_server todo this work?)
  499. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  500. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  501. modelSafetyNewDataPrepare(ctx)
  502. ctx.RenderWithErr("Failed to obsMkdir_result", tplCloudBrainModelSafetyNewNpu, nil)
  503. return
  504. }
  505. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  506. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  507. modelSafetyNewDataPrepare(ctx)
  508. ctx.RenderWithErr("Failed to obsMkdir_log", tplCloudBrainModelSafetyNewNpu, nil)
  509. return
  510. }
  511. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  512. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  513. modelSafetyNewDataPrepare(ctx)
  514. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  515. return
  516. }
  517. var parameters models.Parameters
  518. param := make([]models.Parameter, 0)
  519. param = append(param, models.Parameter{
  520. Label: modelarts.ResultUrl,
  521. Value: "s3:/" + resultObsPath,
  522. }, models.Parameter{
  523. Label: modelarts.CkptUrl,
  524. Value: "s3:/" + ckptUrl,
  525. })
  526. uuid := srcDataset + ";" + combatDataset
  527. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  528. if err != nil {
  529. modelSafetyNewDataPrepare(ctx)
  530. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  531. return
  532. }
  533. dataPath := dataUrl
  534. jsondatas, err := json.Marshal(datasUrlList)
  535. if err != nil {
  536. log.Error("Failed to Marshal: %v", err)
  537. modelSafetyNewDataPrepare(ctx)
  538. ctx.RenderWithErr("json error:"+err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  539. return
  540. }
  541. if isMultiDataset {
  542. param = append(param, models.Parameter{
  543. Label: modelarts.MultiDataUrl,
  544. Value: string(jsondatas),
  545. })
  546. }
  547. existDeviceTarget := false
  548. if len(Params) != 0 {
  549. err := json.Unmarshal([]byte(Params), &parameters)
  550. if err != nil {
  551. log.Error("Failed to Unmarshal params: %s (%v)", Params, err)
  552. modelSafetyNewDataPrepare(ctx)
  553. ctx.RenderWithErr("运行参数错误", tplCloudBrainModelSafetyNewNpu, nil)
  554. return
  555. }
  556. for _, parameter := range parameters.Parameter {
  557. if parameter.Label == modelarts.DeviceTarget {
  558. existDeviceTarget = true
  559. }
  560. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  561. param = append(param, models.Parameter{
  562. Label: parameter.Label,
  563. Value: parameter.Value,
  564. })
  565. }
  566. }
  567. }
  568. if !existDeviceTarget {
  569. param = append(param, models.Parameter{
  570. Label: modelarts.DeviceTarget,
  571. Value: modelarts.Ascend,
  572. })
  573. }
  574. req := &modelarts.GenerateInferenceJobReq{
  575. JobName: jobName,
  576. DisplayJobName: displayJobName,
  577. DataUrl: dataPath,
  578. Description: description,
  579. CodeObsPath: codeObsPath,
  580. BootFileUrl: codeObsPath + BootFile,
  581. BootFile: BootFile,
  582. TrainUrl: trainUrl,
  583. WorkServerNumber: 1,
  584. EngineID: int64(engineID),
  585. LogUrl: logObsPath,
  586. PoolID: poolID,
  587. Uuid: uuid,
  588. Parameters: param, //modelarts train parameters
  589. CommitID: commitID,
  590. BranchName: cloudbrain.DefaultBranchName,
  591. Params: Params,
  592. FlavorName: FlavorName,
  593. EngineName: EngineName,
  594. LabelName: evaluationIndex,
  595. IsLatestVersion: isLatestVersion,
  596. VersionCount: VersionCount,
  597. TotalVersionCount: modelarts.TotalVersionCount,
  598. ModelName: modelName,
  599. ModelVersion: modelVersion,
  600. CkptName: ckptName,
  601. ResultUrl: resultObsPath,
  602. Spec: spec,
  603. DatasetName: datasetNames,
  604. JobType: string(models.JobTypeModelSafety),
  605. }
  606. err = modelarts.GenerateInferenceJob(ctx, req)
  607. if err != nil {
  608. log.Error("GenerateTrainJob failed:%v", err.Error())
  609. modelSafetyNewDataPrepare(ctx)
  610. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  611. return
  612. }
  613. }
  614. func createForGPU(ctx *context.Context, jobName string) {
  615. BootFile := ctx.Query("boot_file")
  616. displayJobName := ctx.Query("display_job_name")
  617. description := ctx.Query("description")
  618. image := strings.TrimSpace(ctx.Query("image"))
  619. srcDataset := ctx.Query("src_dataset") //uuid
  620. combatDataset := ctx.Query("combat_dataset") //uuid
  621. evaluationIndex := ctx.Query("evaluationIndex")
  622. Params := ctx.Query("run_para_list")
  623. specId := ctx.QueryInt64("spec_id")
  624. TrainUrl := ctx.Query("train_url")
  625. CkptName := ctx.Query("ckpt_name")
  626. ckptUrl := setting.Attachment.Minio.RealPath + TrainUrl + CkptName
  627. log.Info("ckpt url:" + ckptUrl)
  628. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  629. JobType: models.JobTypeBenchmark,
  630. ComputeResource: models.GPU,
  631. Cluster: models.OpenICluster,
  632. AiCenterCode: models.AICenterOfCloudBrainOne})
  633. if err != nil || spec == nil {
  634. modelSafetyNewDataPrepare(ctx)
  635. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGpu, nil)
  636. return
  637. }
  638. repo := ctx.Repo.Repository
  639. codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
  640. os.RemoveAll(codePath)
  641. if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil {
  642. log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"])
  643. modelSafetyNewDataPrepare(ctx)
  644. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  645. return
  646. }
  647. err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
  648. if err != nil {
  649. log.Error("uploadCodeToMinio failed, %v", err, ctx.Data["MsgID"])
  650. modelSafetyNewDataPrepare(ctx)
  651. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  652. return
  653. }
  654. uuid := srcDataset + ";" + combatDataset
  655. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
  656. log.Info("uuid=" + uuid)
  657. if err != nil {
  658. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  659. modelSafetyNewDataPrepare(ctx)
  660. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil)
  661. return
  662. }
  663. command, err := getGpuModelSafetyCommand(BootFile, Params, CkptName, displayJobName)
  664. if err != nil {
  665. log.Error("Get Command failed: %v", err, ctx.Data["MsgID"])
  666. modelSafetyNewDataPrepare(ctx)
  667. //ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil) TODO
  668. return
  669. }
  670. log.Info("Command=" + command)
  671. req := cloudbrain.GenerateCloudBrainTaskReq{
  672. Ctx: ctx,
  673. DisplayJobName: displayJobName,
  674. JobName: jobName,
  675. Image: image,
  676. Command: command,
  677. Uuids: uuid,
  678. DatasetNames: datasetNames,
  679. DatasetInfos: datasetInfos,
  680. CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
  681. ModelPath: setting.Attachment.Minio.RealPath + TrainUrl,
  682. BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
  683. Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
  684. BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
  685. JobType: string(models.JobTypeModelSafety),
  686. Description: description,
  687. BranchName: cloudbrain.DefaultBranchName,
  688. BootFile: BootFile,
  689. Params: Params,
  690. CommitID: "",
  691. ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
  692. Spec: spec,
  693. LabelName: evaluationIndex,
  694. }
  695. err = cloudbrain.GenerateTask(req)
  696. if err != nil {
  697. modelSafetyNewDataPrepare(ctx)
  698. ctx.RenderWithErr(err.Error(), tplCloudBrainBenchmarkNew, nil)
  699. return
  700. }
  701. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/modelsafety_test")
  702. }
  703. func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, DisplayJobName string) (string, error) {
  704. var command string
  705. bootFile := strings.TrimSpace(BootFile)
  706. if !strings.HasSuffix(bootFile, ".py") {
  707. log.Error("bootFile(%s) format error", bootFile)
  708. return command, errors.New("bootFile format error")
  709. }
  710. var parameters models.Parameters
  711. var param string
  712. if len(params) != 0 {
  713. err := json.Unmarshal([]byte(params), &parameters)
  714. if err != nil {
  715. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  716. return command, err
  717. }
  718. for _, parameter := range parameters.Parameter {
  719. param += " --" + parameter.Label + "=" + parameter.Value
  720. }
  721. }
  722. param += " --modelname" + "=" + CkptName
  723. command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + DisplayJobName + "-" + cloudbrain.LogFile
  724. return command, nil
  725. }
  726. func modelSafetyNewDataPrepare(ctx *context.Context) error {
  727. ctx.Data["PageIsCloudBrain"] = true
  728. t := time.Now()
  729. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  730. ctx.Data["display_job_name"] = displayJobName
  731. ctx.Data["command"] = cloudbrain.GetCloudbrainDebugCommand()
  732. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  733. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  734. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  735. ctx.Data["benchmark_path"] = cloudbrain.BenchMarkMountPath
  736. ctx.Data["is_benchmark_enabled"] = setting.IsBenchmarkEnabled
  737. if categories == nil {
  738. json.Unmarshal([]byte(setting.BenchmarkCategory), &categories)
  739. }
  740. ctx.Data["benchmark_categories"] = categories.Category
  741. ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType
  742. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  743. if queuesDetail != nil {
  744. ctx.Data["QueuesDetail"] = queuesDetail
  745. }
  746. prepareCloudbrainOneSpecs(ctx)
  747. ctx.Data["params"] = ""
  748. ctx.Data["branchName"] = ctx.Repo.BranchName
  749. ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath
  750. ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled
  751. ctx.Data["brainscore_path"] = cloudbrain.BrainScoreMountPath
  752. ctx.Data["is_brainscore_enabled"] = setting.IsBrainScoreEnabled
  753. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  754. ctx.Data["benchmarkMode"] = ctx.Query("benchmarkMode")
  755. return nil
  756. }
  757. func getJsonContent(url string) (string, error) {
  758. resp, err := http.Get(url)
  759. if err != nil || resp.StatusCode != 200 {
  760. log.Info("Get organizations url error=" + err.Error())
  761. return "", err
  762. }
  763. bytes, err := ioutil.ReadAll(resp.Body)
  764. resp.Body.Close()
  765. if err != nil {
  766. log.Info("Get organizations url error=" + err.Error())
  767. return "", err
  768. }
  769. str := string(bytes)
  770. //log.Info("json str =" + str)
  771. return str, nil
  772. }