You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

aisafety.go 40 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "strconv"
  12. "strings"
  13. "time"
  14. "code.gitea.io/gitea/models"
  15. "code.gitea.io/gitea/modules/aisafety"
  16. "code.gitea.io/gitea/modules/cloudbrain"
  17. "code.gitea.io/gitea/modules/context"
  18. "code.gitea.io/gitea/modules/git"
  19. "code.gitea.io/gitea/modules/grampus"
  20. "code.gitea.io/gitea/modules/log"
  21. "code.gitea.io/gitea/modules/modelarts"
  22. "code.gitea.io/gitea/modules/setting"
  23. "code.gitea.io/gitea/modules/storage"
  24. "code.gitea.io/gitea/modules/timeutil"
  25. "code.gitea.io/gitea/modules/util"
  26. "code.gitea.io/gitea/services/cloudbrain/resource"
  27. "code.gitea.io/gitea/services/reward/point/account"
  28. uuid "github.com/satori/go.uuid"
  29. )
  // Template paths of the model-safety-test pages rendered by the handlers in this file.
  const (
  	// Task creation forms, one per cluster / compute-resource combination.
  	tplModelSafetyTestCreateGrampusGpu = "repo/modelsafety/newgrampusgpu"
  	tplModelSafetyTestCreateGrampusNpu = "repo/modelsafety/newgrampusnpu"
  	tplModelSafetyTestCreateGpu        = "repo/modelsafety/newgpu"
  	tplModelSafetyTestCreateNpu        = "repo/modelsafety/newnpu"

  	// Task detail page.
  	tplModelSafetyTestShow = "repo/modelsafety/show"
  )
  37. func CloudBrainAiSafetyCreateTest(ctx *context.Context) {
  38. log.Info("start to create CloudBrainAiSafetyCreate")
  39. uuid := uuid.NewV4()
  40. id := uuid.String()
  41. seriaNoParas := ctx.Query("serialNo")
  42. fileName := ctx.Query("fileName")
  43. //if jobType == string(models.JobTypeBenchmark) {
  44. req := aisafety.TaskReq{
  45. UnionId: id,
  46. EvalName: "test1",
  47. EvalContent: "test1",
  48. TLPath: "test1",
  49. Indicators: []string{"ACC", "ASS"},
  50. CDName: "CIFAR10_1000_FGSM",
  51. BDName: "CIFAR10_1000基础数据集",
  52. }
  53. aisafety.GetAlgorithmList()
  54. if seriaNoParas != "" {
  55. aisafety.GetTaskStatus(seriaNoParas)
  56. } else {
  57. jsonStr, err := getJsonContent("http://192.168.207.34:8065/Test_zap1234/openi_aisafety/raw/branch/master/result/" + fileName)
  58. serialNo, err := aisafety.CreateSafetyTask(req, jsonStr)
  59. if err == nil {
  60. log.Info("serialNo=" + serialNo)
  61. time.Sleep(time.Duration(2) * time.Second)
  62. aisafety.GetTaskStatus(serialNo)
  63. } else {
  64. log.Info("CreateSafetyTask error," + err.Error())
  65. }
  66. }
  67. }
  68. func GetAiSafetyTaskByJob(job *models.Cloudbrain) {
  69. if job == nil {
  70. log.Error("GetCloudbrainByJobID failed")
  71. return
  72. }
  73. syncAiSafetyTaskStatus(job)
  74. }
  75. func GetAiSafetyTaskTmpl(ctx *context.Context) {
  76. ctx.Data["id"] = ctx.Params(":jobid")
  77. ctx.HTML(200, tplModelSafetyTestShow)
  78. }
  79. func GetAiSafetyTask(ctx *context.Context) {
  80. var ID = ctx.Params(":jobid")
  81. job, err := models.GetCloudbrainByIDWithDeleted(ID)
  82. if err != nil {
  83. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  84. return
  85. }
  86. syncAiSafetyTaskStatus(job)
  87. job, err = models.GetCloudbrainByIDWithDeleted(ID)
  88. job.BenchmarkType = "安全评测"
  89. job.BenchmarkTypeName = "Image Classification"
  90. ctx.JSON(200, job)
  91. }
  92. func StopAiSafetyTask(ctx *context.Context) {
  93. var ID = ctx.Params(":jobid")
  94. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  95. result := make(map[string]interface{})
  96. result["code"] = -1
  97. if err != nil {
  98. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  99. result["msg"] = "No such task."
  100. ctx.JSON(200, result)
  101. return
  102. }
  103. if isTaskNotFinished(task.Status) {
  104. if task.Type == models.TypeCloudBrainTwo {
  105. //queryTaskStatusFromCloudbrainTwo(job)
  106. } else if task.Type == models.TypeCloudBrainOne {
  107. if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) || task.Status == string(models.JobSucceeded) {
  108. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  109. result["msg"] = "cloudbrain.Already_stopped"
  110. ctx.JSON(200, result)
  111. return
  112. }
  113. err := cloudbrain.StopJob(task.JobID)
  114. if err != nil {
  115. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  116. result["msg"] = "cloudbrain.Stopped_failed"
  117. ctx.JSON(200, result)
  118. return
  119. }
  120. task.Status = string(models.JobStopped)
  121. if task.EndTime == 0 {
  122. task.EndTime = timeutil.TimeStampNow()
  123. }
  124. task.ComputeAndSetDuration()
  125. err = models.UpdateJob(task)
  126. if err != nil {
  127. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  128. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  129. ctx.JSON(200, result)
  130. return
  131. }
  132. }
  133. } else {
  134. if task.Status == string(models.ModelSafetyTesting) {
  135. //修改为Failed
  136. task.Status = string(models.JobStopped)
  137. if task.EndTime == 0 {
  138. task.EndTime = timeutil.TimeStampNow()
  139. }
  140. task.ComputeAndSetDuration()
  141. err = models.UpdateJob(task)
  142. if err != nil {
  143. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  144. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  145. ctx.JSON(200, result)
  146. return
  147. }
  148. } else {
  149. log.Info("The job is finished. status=" + task.Status)
  150. }
  151. }
  152. }
  153. func DelAiSafetyTask(ctx *context.Context) {
  154. var ID = ctx.Params(":jobid")
  155. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  156. result := make(map[string]interface{})
  157. result["code"] = 1
  158. if err != nil {
  159. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  160. result["msg"] = "No such task."
  161. ctx.JSON(200, result)
  162. return
  163. }
  164. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  165. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  166. result["msg"] = "the job(" + task.JobName + ") has not been stopped"
  167. ctx.JSON(200, result)
  168. return
  169. }
  170. if task.Type == models.TypeCloudBrainOne {
  171. DeleteCloudbrainJobStorage(task.JobName, models.TypeCloudBrainOne)
  172. }
  173. err = models.DeleteJob(task)
  174. if err != nil {
  175. result["msg"] = err.Error()
  176. ctx.JSON(200, result)
  177. return
  178. }
  179. result["code"] = 0
  180. result["msg"] = "Succeed"
  181. ctx.JSON(200, result)
  182. }
  183. func syncAiSafetyTaskStatus(job *models.Cloudbrain) {
  184. if isTaskNotFinished(job.Status) {
  185. if job.Type == models.TypeCloudBrainTwo {
  186. queryTaskStatusFromCloudbrainTwo(job)
  187. } else if job.Type == models.TypeCloudBrainOne {
  188. queryTaskStatusFromCloudbrain(job)
  189. } else if job.Type == models.TypeC2Net {
  190. queryTaskStatusFromGrampus(job)
  191. }
  192. } else {
  193. if job.Status == string(models.ModelSafetyTesting) {
  194. queryTaskStatusFromModelSafetyTestServer(job)
  195. } else {
  196. log.Info("The job is finished. status=" + job.Status)
  197. }
  198. }
  199. }
  200. func TimerHandleModelSafetyTestTask() {
  201. tasks, err := models.GetModelSafetyTestTask()
  202. if err == nil {
  203. if tasks != nil && len(tasks) > 0 {
  204. for _, job := range tasks {
  205. syncAiSafetyTaskStatus(job)
  206. }
  207. } else {
  208. log.Info("query running model safety test task 0.")
  209. }
  210. } else {
  211. log.Info("query running model safety test task err." + err.Error())
  212. }
  213. }
  214. func queryTaskStatusFromGrampus(task *models.Cloudbrain) {
  215. log.Info("The task not finished,name=" + task.DisplayJobName)
  216. if task.DeletedAt.IsZero() { //normal record
  217. result, err := grampus.GetJob(task.JobID)
  218. resultJson, _ := json.Marshal(result)
  219. log.Info("resultJson=" + string(resultJson))
  220. if err != nil {
  221. log.Error("GetJob failed:" + err.Error())
  222. return
  223. }
  224. if result != nil {
  225. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  226. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  227. }
  228. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  229. if task.Status != models.GrampusStatusSucceeded {
  230. if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning {
  231. task.Duration = result.JobInfo.RunSec
  232. if task.Duration < 0 {
  233. task.Duration = 0
  234. }
  235. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  236. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  237. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  238. }
  239. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  240. task.EndTime = task.StartTime.Add(task.Duration)
  241. }
  242. task.CorrectCreateUnix()
  243. err = models.UpdateJob(task)
  244. if err != nil {
  245. log.Error("UpdateJob failed:" + err.Error())
  246. }
  247. }
  248. } else {
  249. task.Status = string(models.ModelSafetyTesting)
  250. err = models.UpdateJob(task)
  251. if err != nil {
  252. log.Error("UpdateJob failed:", err)
  253. }
  254. //send msg to beihang
  255. sendGPUInferenceResultToTest(task)
  256. }
  257. }
  258. }
  259. }
  260. func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) {
  261. log.Info("The task not finished,name=" + job.DisplayJobName)
  262. result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10))
  263. if err != nil {
  264. log.Info("query train job error." + err.Error())
  265. return
  266. }
  267. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  268. job.Duration = result.Duration
  269. job.TrainJobDuration = result.TrainJobDuration
  270. if job.Status != string(models.ModelArtsTrainJobCompleted) {
  271. err = models.UpdateJob(job)
  272. if err != nil {
  273. log.Error("UpdateJob failed:", err)
  274. }
  275. } else {
  276. job.Status = string(models.ModelSafetyTesting)
  277. err = models.UpdateJob(job)
  278. if err != nil {
  279. log.Error("UpdateJob failed:", err)
  280. }
  281. //send msg to beihang
  282. sendNPUInferenceResultToTest(job)
  283. }
  284. }
  285. func sendNPUInferenceResultToTest(job *models.Cloudbrain) {
  286. datasetname := job.DatasetName
  287. datasetnames := strings.Split(datasetname, ";")
  288. indicator := job.LabelName
  289. req := aisafety.TaskReq{
  290. UnionId: job.JobID,
  291. EvalName: job.DisplayJobName,
  292. EvalContent: job.Description,
  293. TLPath: "test",
  294. Indicators: strings.Split(indicator, ";"),
  295. CDName: datasetnames[1],
  296. BDName: datasetnames[0],
  297. }
  298. jsonContent := ""
  299. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  300. resultPath := modelarts.JobPath + job.JobName + modelarts.ResultPath + VersionOutputPath + "/result.json"
  301. body, err := storage.ObsDownloadAFile(setting.Bucket, resultPath)
  302. if err != nil {
  303. log.Info("ObsDownloadAFile error." + err.Error() + " resultPath=" + resultPath)
  304. } else {
  305. defer body.Close()
  306. var data []byte
  307. p := make([]byte, 4096)
  308. var readErr error
  309. var readCount int
  310. for {
  311. readCount, readErr = body.Read(p)
  312. if readCount > 0 {
  313. data = append(data, p[:readCount]...)
  314. }
  315. if readErr != nil || readCount == 0 {
  316. break
  317. }
  318. }
  319. jsonContent = string(data)
  320. }
  321. if jsonContent != "" {
  322. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  323. if err == nil {
  324. //update serial no to db
  325. job.PreVersionName = serialNo
  326. err = models.UpdateJob(job)
  327. if err != nil {
  328. log.Error("UpdateJob failed:", err)
  329. }
  330. }
  331. } else {
  332. log.Info("The json is null. so set it failed.")
  333. //update task failed.
  334. job.Status = string(models.ModelArtsTrainJobFailed)
  335. err := models.UpdateJob(job)
  336. if err != nil {
  337. log.Error("UpdateJob failed:", err)
  338. }
  339. }
  340. }
  341. func queryTaskStatusFromCloudbrain(job *models.Cloudbrain) {
  342. log.Info("The task not finished,name=" + job.DisplayJobName)
  343. jobResult, err := cloudbrain.GetJob(job.JobID)
  344. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  345. if err != nil {
  346. log.Error("ConvertToJobResultPayload failed:", err)
  347. return
  348. }
  349. job.Status = result.JobStatus.State
  350. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  351. taskRoles := result.TaskRoles
  352. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  353. job.Status = taskRes.TaskStatuses[0].State
  354. }
  355. if result.JobStatus.State != string(models.JobSucceeded) {
  356. err = models.UpdateJob(job)
  357. if err != nil {
  358. log.Error("UpdateJob failed:", err)
  359. }
  360. } else {
  361. //
  362. job.Status = string(models.ModelSafetyTesting)
  363. err = models.UpdateJob(job)
  364. if err != nil {
  365. log.Error("UpdateJob failed:", err)
  366. }
  367. //send msg to beihang
  368. sendGPUInferenceResultToTest(job)
  369. }
  370. }
  371. func queryTaskStatusFromModelSafetyTestServer(job *models.Cloudbrain) {
  372. result, err := aisafety.GetTaskStatus(job.PreVersionName)
  373. if err == nil {
  374. if result.Code == "0" {
  375. if result.Data.Status == 1 {
  376. log.Info("The task is running....")
  377. } else {
  378. if result.Data.Code == 0 {
  379. job.ResultJson = result.Data.StandardJson
  380. err = models.UpdateJob(job)
  381. if err != nil {
  382. log.Error("UpdateJob failed:", err)
  383. }
  384. }
  385. }
  386. } else {
  387. log.Info("The task is failed.")
  388. job.Status = string(models.JobFailed)
  389. err = models.UpdateJob(job)
  390. if err != nil {
  391. log.Error("UpdateJob failed:", err)
  392. }
  393. }
  394. } else {
  395. log.Info("The task not found.....")
  396. }
  397. }
  398. func sendGPUInferenceResultToTest(job *models.Cloudbrain) {
  399. datasetname := job.DatasetName
  400. datasetnames := strings.Split(datasetname, ";")
  401. indicator := job.LabelName
  402. req := aisafety.TaskReq{
  403. UnionId: job.JobID,
  404. EvalName: job.DisplayJobName,
  405. EvalContent: job.Description,
  406. TLPath: "test",
  407. Indicators: strings.Split(indicator, ";"),
  408. CDName: datasetnames[1],
  409. BDName: datasetnames[0],
  410. }
  411. resultDir := "/model"
  412. prefix := "/" + setting.CBCodePathPrefix + job.JobName + resultDir
  413. files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "")
  414. if err != nil {
  415. log.Error("query cloudbrain one model failed: %v", err)
  416. return
  417. }
  418. jsonContent := ""
  419. for _, file := range files {
  420. if strings.HasSuffix(file.FileName, "result.json") {
  421. path := storage.GetMinioPath(job.JobName+resultDir+"/", file.FileName)
  422. log.Info("path=" + path)
  423. reader, err := os.Open(path)
  424. defer reader.Close()
  425. if err == nil {
  426. r := bufio.NewReader(reader)
  427. for {
  428. line, error := r.ReadString('\n')
  429. if error == io.EOF {
  430. log.Info("read file completed.")
  431. break
  432. }
  433. if error != nil {
  434. log.Info("read file error." + error.Error())
  435. break
  436. }
  437. jsonContent += line
  438. }
  439. }
  440. break
  441. }
  442. }
  443. if jsonContent != "" {
  444. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  445. if err == nil {
  446. //update serial no to db
  447. job.PreVersionName = serialNo
  448. err = models.UpdateJob(job)
  449. if err != nil {
  450. log.Error("UpdateJob failed:", err)
  451. }
  452. }
  453. } else {
  454. log.Info("The json is null. so set it failed.")
  455. //update task failed.
  456. job.Status = string(models.JobFailed)
  457. err = models.UpdateJob(job)
  458. if err != nil {
  459. log.Error("UpdateJob failed:", err)
  460. }
  461. }
  462. }
  463. func isTaskNotFinished(status string) bool {
  464. if status == string(models.ModelArtsTrainJobRunning) || status == string(models.ModelArtsTrainJobWaiting) {
  465. return true
  466. }
  467. if status == string(models.JobWaiting) || status == string(models.JobRunning) {
  468. return true
  469. }
  470. if status == string(models.ModelArtsTrainJobUnknown) || status == string(models.ModelArtsTrainJobInit) {
  471. return true
  472. }
  473. if status == string(models.ModelArtsTrainJobImageCreating) || status == string(models.ModelArtsTrainJobSubmitTrying) {
  474. return true
  475. }
  476. return false
  477. }
  478. func AiSafetyCreateForGetGPU(ctx *context.Context) {
  479. t := time.Now()
  480. ctx.Data["PageIsCloudBrain"] = true
  481. ctx.Data["IsCreate"] = true
  482. ctx.Data["type"] = models.TypeCloudBrainOne
  483. ctx.Data["compute_resource"] = models.GPUResource
  484. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  485. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  486. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  487. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  488. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  489. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  490. ctx.Data["display_job_name"] = displayJobName
  491. prepareCloudbrainOneSpecs(ctx)
  492. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  493. if queuesDetail != nil {
  494. ctx.Data["QueuesDetail"] = queuesDetail
  495. }
  496. ctx.HTML(200, tplModelSafetyTestCreateGpu)
  497. }
  498. func AiSafetyCreateForGetGrampusGPU(ctx *context.Context) {
  499. ctx.Data["PageIsCloudBrain"] = true
  500. ctx.Data["IsCreate"] = true
  501. ctx.Data["type"] = models.TypeC2Net
  502. ctx.Data["compute_resource"] = models.GPUResource
  503. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  504. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  505. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  506. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  507. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  508. err := GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  509. if err != nil {
  510. ctx.ServerError("get new train-job info failed", err)
  511. return
  512. }
  513. ctx.HTML(200, tplModelSafetyTestCreateGrampusGpu)
  514. }
  515. func AiSafetyCreateForGetGrampusNPU(ctx *context.Context) {
  516. ctx.Data["PageIsCloudBrain"] = true
  517. ctx.Data["IsCreate"] = true
  518. ctx.Data["type"] = models.TypeC2Net
  519. ctx.Data["compute_resource"] = models.NPUResource
  520. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  521. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  522. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  523. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  524. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  525. err := GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  526. if err != nil {
  527. ctx.ServerError("get new train-job info failed", err)
  528. return
  529. }
  530. ctx.HTML(200, tplModelSafetyTestCreateGrampusNpu)
  531. }
  532. func AiSafetyCreateForGetNPU(ctx *context.Context) {
  533. t := time.Now()
  534. ctx.Data["PageIsCloudBrain"] = true
  535. ctx.Data["IsCreate"] = true
  536. ctx.Data["type"] = models.TypeCloudBrainTwo
  537. ctx.Data["compute_resource"] = models.NPUResource
  538. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  539. ctx.Data["display_job_name"] = displayJobName
  540. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  541. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  542. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  543. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  544. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  545. var resourcePools modelarts.ResourcePool
  546. if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  547. ctx.ServerError("json.Unmarshal failed:", err)
  548. }
  549. ctx.Data["resource_pools"] = resourcePools.Info
  550. var engines modelarts.Engine
  551. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  552. ctx.ServerError("json.Unmarshal failed:", err)
  553. }
  554. ctx.Data["engines"] = engines.Info
  555. var versionInfos modelarts.VersionInfo
  556. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  557. ctx.ServerError("json.Unmarshal failed:", err)
  558. }
  559. ctx.Data["engine_versions"] = versionInfos.Version
  560. prepareCloudbrainTwoInferenceSpecs(ctx)
  561. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  562. ctx.Data["WaitCount"] = waitCount
  563. ctx.HTML(200, tplModelSafetyTestCreateNpu)
  564. }
  565. func AiSafetyCreateForPost(ctx *context.Context) {
  566. ctx.Data["PageIsCloudBrain"] = true
  567. displayJobName := ctx.Query("display_job_name")
  568. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  569. taskType := ctx.QueryInt("type")
  570. description := ctx.Query("description")
  571. ctx.Data["description"] = description
  572. repo := ctx.Repo.Repository
  573. tpname := tplCloudBrainModelSafetyNewNpu
  574. if taskType == models.TypeCloudBrainOne {
  575. tpname = tplCloudBrainModelSafetyNewGpu
  576. }
  577. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeModelSafety), displayJobName)
  578. if err == nil {
  579. if len(tasks) != 0 {
  580. log.Error("the job name did already exist", ctx.Data["MsgID"])
  581. modelSafetyNewDataPrepare(ctx)
  582. ctx.RenderWithErr("the job name did already exist", tpname, nil)
  583. return
  584. }
  585. } else {
  586. if !models.IsErrJobNotExist(err) {
  587. log.Error("system error, %v", err, ctx.Data["MsgID"])
  588. modelSafetyNewDataPrepare(ctx)
  589. ctx.RenderWithErr("system error", tpname, nil)
  590. return
  591. }
  592. }
  593. if !jobNamePattern.MatchString(jobName) {
  594. modelSafetyNewDataPrepare(ctx)
  595. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpname, nil)
  596. return
  597. }
  598. count, err := models.GetModelSafetyCountByUserID(ctx.User.ID)
  599. if err != nil {
  600. log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
  601. modelSafetyNewDataPrepare(ctx)
  602. ctx.RenderWithErr("system error", tpname, nil)
  603. return
  604. } else {
  605. if count >= 1 {
  606. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  607. modelSafetyNewDataPrepare(ctx)
  608. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpname, nil)
  609. return
  610. }
  611. }
  612. BootFile := ctx.Query("boot_file")
  613. bootFileExist, err := ctx.Repo.FileExists(BootFile, cloudbrain.DefaultBranchName)
  614. if err != nil || !bootFileExist {
  615. log.Error("Get bootfile error:", err)
  616. modelSafetyNewDataPrepare(ctx)
  617. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpname, nil)
  618. return
  619. }
  620. if taskType == models.TypeCloudBrainTwo {
  621. createForNPU(ctx, jobName)
  622. } else if taskType == models.TypeCloudBrainOne {
  623. createForGPU(ctx, jobName)
  624. } else if taskType == models.TypeC2Net {
  625. ComputeResource := ctx.Query("compute_resource")
  626. if ComputeResource == models.NPUResource {
  627. createForGrampusNPU(ctx, jobName)
  628. } else if ComputeResource == models.GPUResource {
  629. createForGrampusGPU(ctx, jobName)
  630. }
  631. }
  632. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  633. }
  634. func createForGrampusGPU(ctx *context.Context, jobName string) {
  635. BootFile := ctx.Query("boot_file")
  636. displayJobName := ctx.Query("display_job_name")
  637. description := ctx.Query("description")
  638. image := strings.TrimSpace(ctx.Query("image"))
  639. srcDataset := ctx.Query("src_dataset") //uuid
  640. combatDataset := ctx.Query("combat_dataset") //uuid
  641. evaluationIndex := ctx.Query("evaluation_index")
  642. Params := ctx.Query("run_para_list")
  643. specId := ctx.QueryInt64("spec_id")
  644. TrainUrl := ctx.Query("train_url")
  645. CkptName := ctx.Query("ckpt_name")
  646. ModelName := ctx.Query("ModelName")
  647. ModelVersion := ctx.Query("ModelVersion")
  648. repo := ctx.Repo.Repository
  649. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  650. codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  651. //check specification
  652. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  653. JobType: models.JobTypeTrain,
  654. ComputeResource: models.GPU,
  655. Cluster: models.C2NetCluster,
  656. })
  657. if err != nil || spec == nil {
  658. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  659. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGrampusGpu, nil)
  660. return
  661. }
  662. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  663. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  664. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  665. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  666. return
  667. }
  668. //check dataset
  669. uuid := srcDataset + ";" + combatDataset
  670. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU)
  671. if err != nil {
  672. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  673. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  674. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  675. return
  676. }
  677. //prepare code and out path
  678. _, err = ioutil.ReadDir(codeLocalPath)
  679. if err == nil {
  680. os.RemoveAll(codeLocalPath)
  681. }
  682. if err := downloadZipCode(ctx, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  683. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  684. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  685. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  686. return
  687. }
  688. //todo: upload code (send to file_server todo this work?)
  689. //upload code
  690. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  691. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  692. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  693. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  694. return
  695. }
  696. modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
  697. if err := mkModelPath(modelPath); err != nil {
  698. log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  699. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  700. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  701. return
  702. }
  703. //init model readme
  704. if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
  705. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  706. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  707. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  708. return
  709. }
  710. var datasetRemotePath, allFileName string
  711. for _, datasetInfo := range datasetInfos {
  712. if datasetRemotePath == "" {
  713. datasetRemotePath = datasetInfo.DataLocalPath
  714. allFileName = datasetInfo.FullName
  715. } else {
  716. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
  717. allFileName = allFileName + ";" + datasetInfo.FullName
  718. }
  719. }
  720. //prepare command
  721. preTrainModelPath := getPreTrainModelPath(TrainUrl, CkptName)
  722. command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, BootFile, Params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, CkptName)
  723. if err != nil {
  724. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  725. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  726. ctx.RenderWithErr("Create task failed, internal error", tplCloudBrainModelSafetyNewGrampusGpu, nil)
  727. return
  728. }
  729. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  730. req := &grampus.GenerateTrainJobReq{
  731. JobName: jobName,
  732. DisplayJobName: displayJobName,
  733. ComputeResource: models.GPUResource,
  734. ProcessType: grampus.ProcessorTypeGPU,
  735. Command: command,
  736. ImageUrl: image,
  737. Description: description,
  738. BootFile: BootFile,
  739. Uuid: uuid,
  740. CommitID: commitID,
  741. BranchName: cloudbrain.DefaultBranchName,
  742. Params: Params,
  743. EngineName: image,
  744. DatasetNames: datasetNames,
  745. DatasetInfos: datasetInfos,
  746. IsLatestVersion: modelarts.IsLatestVersion,
  747. VersionCount: modelarts.VersionCountOne,
  748. WorkServerNumber: 1,
  749. Spec: spec,
  750. ModelName: ModelName,
  751. LabelName: evaluationIndex,
  752. CkptName: CkptName,
  753. ModelVersion: ModelVersion,
  754. PreTrainModelUrl: TrainUrl,
  755. }
  756. err = grampus.GenerateTrainJob(ctx, req)
  757. if err != nil {
  758. log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
  759. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  760. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  761. return
  762. }
  763. }
  764. func createForGrampusNPU(ctx *context.Context, jobName string) {
  765. }
  766. func createForNPU(ctx *context.Context, jobName string) {
  767. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  768. BootFile := ctx.Query("boot_file")
  769. displayJobName := ctx.Query("display_job_name")
  770. description := ctx.Query("description")
  771. srcDataset := ctx.Query("src_dataset") //uuid
  772. combatDataset := ctx.Query("combat_dataset") //uuid
  773. evaluationIndex := ctx.Query("evaluation_index")
  774. Params := ctx.Query("run_para_list")
  775. specId := ctx.QueryInt64("spec_id")
  776. engineID := ctx.QueryInt("engine_id")
  777. log.Info("engine_id=" + fmt.Sprint(engineID))
  778. poolID := ctx.Query("pool_id")
  779. repo := ctx.Repo.Repository
  780. trainUrl := ctx.Query("train_url")
  781. modelName := ctx.Query("model_name")
  782. modelVersion := ctx.Query("model_version")
  783. ckptName := ctx.Query("ckpt_name")
  784. ckptUrl := "/" + trainUrl + ckptName
  785. log.Info("ckpt url:" + ckptUrl)
  786. FlavorName := ctx.Query("flaver_names")
  787. EngineName := ctx.Query("engine_names")
  788. isLatestVersion := modelarts.IsLatestVersion
  789. VersionCount := modelarts.VersionCountOne
  790. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  791. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  792. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  793. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  794. log.Info("ckpt url:" + ckptUrl)
  795. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  796. JobType: models.JobTypeInference,
  797. ComputeResource: models.NPU,
  798. Cluster: models.OpenICluster,
  799. AiCenterCode: models.AICenterOfCloudBrainTwo})
  800. if err != nil || spec == nil {
  801. modelSafetyNewDataPrepare(ctx)
  802. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewNpu, nil)
  803. return
  804. }
  805. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  806. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  807. modelSafetyNewDataPrepare(ctx)
  808. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewNpu, nil)
  809. return
  810. }
  811. //todo: del the codeLocalPath
  812. _, err = ioutil.ReadDir(codeLocalPath)
  813. if err == nil {
  814. os.RemoveAll(codeLocalPath)
  815. }
  816. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  817. commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  818. if err := downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  819. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  820. modelSafetyNewDataPrepare(ctx)
  821. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  822. return
  823. }
  824. //todo: upload code (send to file_server todo this work?)
  825. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  826. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  827. modelSafetyNewDataPrepare(ctx)
  828. ctx.RenderWithErr("Failed to obsMkdir_result", tplCloudBrainModelSafetyNewNpu, nil)
  829. return
  830. }
  831. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  832. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  833. modelSafetyNewDataPrepare(ctx)
  834. ctx.RenderWithErr("Failed to obsMkdir_log", tplCloudBrainModelSafetyNewNpu, nil)
  835. return
  836. }
  837. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  838. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  839. modelSafetyNewDataPrepare(ctx)
  840. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  841. return
  842. }
  843. var parameters models.Parameters
  844. param := make([]models.Parameter, 0)
  845. param = append(param, models.Parameter{
  846. Label: modelarts.ResultUrl,
  847. Value: "s3:/" + resultObsPath,
  848. }, models.Parameter{
  849. Label: modelarts.CkptUrl,
  850. Value: "s3:/" + ckptUrl,
  851. })
  852. uuid := srcDataset + ";" + combatDataset
  853. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  854. if err != nil {
  855. modelSafetyNewDataPrepare(ctx)
  856. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  857. return
  858. }
  859. dataPath := dataUrl
  860. jsondatas, err := json.Marshal(datasUrlList)
  861. if err != nil {
  862. log.Error("Failed to Marshal: %v", err)
  863. modelSafetyNewDataPrepare(ctx)
  864. ctx.RenderWithErr("json error:"+err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  865. return
  866. }
  867. if isMultiDataset {
  868. param = append(param, models.Parameter{
  869. Label: modelarts.MultiDataUrl,
  870. Value: string(jsondatas),
  871. })
  872. }
  873. existDeviceTarget := false
  874. if len(Params) != 0 {
  875. err := json.Unmarshal([]byte(Params), &parameters)
  876. if err != nil {
  877. log.Error("Failed to Unmarshal params: %s (%v)", Params, err)
  878. modelSafetyNewDataPrepare(ctx)
  879. ctx.RenderWithErr("运行参数错误", tplCloudBrainModelSafetyNewNpu, nil)
  880. return
  881. }
  882. for _, parameter := range parameters.Parameter {
  883. if parameter.Label == modelarts.DeviceTarget {
  884. existDeviceTarget = true
  885. }
  886. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  887. param = append(param, models.Parameter{
  888. Label: parameter.Label,
  889. Value: parameter.Value,
  890. })
  891. }
  892. }
  893. }
  894. if !existDeviceTarget {
  895. param = append(param, models.Parameter{
  896. Label: modelarts.DeviceTarget,
  897. Value: modelarts.Ascend,
  898. })
  899. }
  900. req := &modelarts.GenerateInferenceJobReq{
  901. JobName: jobName,
  902. DisplayJobName: displayJobName,
  903. DataUrl: dataPath,
  904. Description: description,
  905. CodeObsPath: codeObsPath,
  906. BootFileUrl: codeObsPath + BootFile,
  907. BootFile: BootFile,
  908. TrainUrl: trainUrl,
  909. WorkServerNumber: 1,
  910. EngineID: int64(engineID),
  911. LogUrl: logObsPath,
  912. PoolID: poolID,
  913. Uuid: uuid,
  914. Parameters: param, //modelarts train parameters
  915. CommitID: commitID,
  916. BranchName: cloudbrain.DefaultBranchName,
  917. Params: Params,
  918. FlavorName: FlavorName,
  919. EngineName: EngineName,
  920. LabelName: evaluationIndex,
  921. IsLatestVersion: isLatestVersion,
  922. VersionCount: VersionCount,
  923. TotalVersionCount: modelarts.TotalVersionCount,
  924. ModelName: modelName,
  925. ModelVersion: modelVersion,
  926. CkptName: ckptName,
  927. ResultUrl: resultObsPath,
  928. Spec: spec,
  929. DatasetName: datasetNames,
  930. JobType: string(models.JobTypeModelSafety),
  931. }
  932. err = modelarts.GenerateInferenceJob(ctx, req)
  933. if err != nil {
  934. log.Error("GenerateTrainJob failed:%v", err.Error())
  935. modelSafetyNewDataPrepare(ctx)
  936. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  937. return
  938. }
  939. }
  940. func createForGPU(ctx *context.Context, jobName string) {
  941. BootFile := ctx.Query("boot_file")
  942. displayJobName := ctx.Query("display_job_name")
  943. description := ctx.Query("description")
  944. image := strings.TrimSpace(ctx.Query("image"))
  945. srcDataset := ctx.Query("src_dataset") //uuid
  946. combatDataset := ctx.Query("combat_dataset") //uuid
  947. evaluationIndex := ctx.Query("evaluation_index")
  948. Params := ctx.Query("run_para_list")
  949. specId := ctx.QueryInt64("spec_id")
  950. TrainUrl := ctx.Query("train_url")
  951. CkptName := ctx.Query("ckpt_name")
  952. ckptUrl := setting.Attachment.Minio.RealPath + TrainUrl + CkptName
  953. log.Info("ckpt url:" + ckptUrl)
  954. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  955. JobType: models.JobTypeBenchmark,
  956. ComputeResource: models.GPU,
  957. Cluster: models.OpenICluster,
  958. AiCenterCode: models.AICenterOfCloudBrainOne})
  959. if err != nil || spec == nil {
  960. modelSafetyNewDataPrepare(ctx)
  961. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGpu, nil)
  962. return
  963. }
  964. repo := ctx.Repo.Repository
  965. codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
  966. os.RemoveAll(codePath)
  967. if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil {
  968. log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"])
  969. modelSafetyNewDataPrepare(ctx)
  970. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  971. return
  972. }
  973. err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
  974. if err != nil {
  975. log.Error("uploadCodeToMinio failed, %v", err, ctx.Data["MsgID"])
  976. modelSafetyNewDataPrepare(ctx)
  977. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  978. return
  979. }
  980. uuid := srcDataset + ";" + combatDataset
  981. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
  982. log.Info("uuid=" + uuid)
  983. if err != nil {
  984. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  985. modelSafetyNewDataPrepare(ctx)
  986. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil)
  987. return
  988. }
  989. command, err := getGpuModelSafetyCommand(BootFile, Params, CkptName, displayJobName)
  990. if err != nil {
  991. log.Error("Get Command failed: %v", err, ctx.Data["MsgID"])
  992. modelSafetyNewDataPrepare(ctx)
  993. //ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil) TODO
  994. return
  995. }
  996. log.Info("Command=" + command)
  997. req := cloudbrain.GenerateCloudBrainTaskReq{
  998. Ctx: ctx,
  999. DisplayJobName: displayJobName,
  1000. JobName: jobName,
  1001. Image: image,
  1002. Command: command,
  1003. Uuids: uuid,
  1004. DatasetNames: datasetNames,
  1005. DatasetInfos: datasetInfos,
  1006. CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
  1007. ModelPath: setting.Attachment.Minio.RealPath + TrainUrl,
  1008. BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
  1009. Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
  1010. BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
  1011. JobType: string(models.JobTypeModelSafety),
  1012. Description: description,
  1013. BranchName: cloudbrain.DefaultBranchName,
  1014. BootFile: BootFile,
  1015. Params: Params,
  1016. CommitID: "",
  1017. ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
  1018. Spec: spec,
  1019. LabelName: evaluationIndex,
  1020. }
  1021. err = cloudbrain.GenerateTask(req)
  1022. if err != nil {
  1023. modelSafetyNewDataPrepare(ctx)
  1024. ctx.RenderWithErr(err.Error(), tplCloudBrainBenchmarkNew, nil)
  1025. return
  1026. }
  1027. //ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/modelsafety_test")
  1028. }
  1029. func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, DisplayJobName string) (string, error) {
  1030. var command string
  1031. bootFile := strings.TrimSpace(BootFile)
  1032. if !strings.HasSuffix(bootFile, ".py") {
  1033. log.Error("bootFile(%s) format error", bootFile)
  1034. return command, errors.New("bootFile format error")
  1035. }
  1036. var parameters models.Parameters
  1037. var param string
  1038. if len(params) != 0 {
  1039. err := json.Unmarshal([]byte(params), &parameters)
  1040. if err != nil {
  1041. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1042. return command, err
  1043. }
  1044. for _, parameter := range parameters.Parameter {
  1045. param += " --" + parameter.Label + "=" + parameter.Value
  1046. }
  1047. }
  1048. param += " --modelname" + "=" + CkptName
  1049. command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + DisplayJobName + "-" + cloudbrain.LogFile
  1050. return command, nil
  1051. }
  1052. func modelSafetyNewDataPrepare(ctx *context.Context) error {
  1053. ctx.Data["PageIsCloudBrain"] = true
  1054. ctx.Data["boot_file"] = ctx.Query("boot_file")
  1055. ctx.Data["display_job_name"] = ctx.Query("display_job_name")
  1056. ctx.Data["description"] = ctx.Query("description")
  1057. ctx.Data["image"] = strings.TrimSpace(ctx.Query("image"))
  1058. ctx.Data["src_dataset"] = ctx.Query("src_dataset") //uuid
  1059. ctx.Data["combat_dataset"] = ctx.Query("combat_dataset") //uuid
  1060. ctx.Data["evaluationIndex"] = ctx.Query("evaluation_index")
  1061. ctx.Data["run_para_list"] = ctx.Query("run_para_list")
  1062. ctx.Data["spec_id"] = ctx.QueryInt64("spec_id")
  1063. ctx.Data["train_url"] = ctx.Query("train_url")
  1064. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  1065. prepareCloudbrainOneSpecs(ctx)
  1066. return nil
  1067. }
  1068. func getJsonContent(url string) (string, error) {
  1069. resp, err := http.Get(url)
  1070. if err != nil || resp.StatusCode != 200 {
  1071. log.Info("Get organizations url error=" + err.Error())
  1072. return "", err
  1073. }
  1074. bytes, err := ioutil.ReadAll(resp.Body)
  1075. resp.Body.Close()
  1076. if err != nil {
  1077. log.Info("Get organizations url error=" + err.Error())
  1078. return "", err
  1079. }
  1080. str := string(bytes)
  1081. //log.Info("json str =" + str)
  1082. return str, nil
  1083. }