diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c098ef18..1d93b3ca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -118,6 +118,9 @@ jobs: - name: "Rust Dataflow example" timeout-minutes: 30 run: cargo run --example rust-dataflow + - name: "Rust Git Dataflow example" + timeout-minutes: 30 + run: cargo run --example rust-dataflow-git - name: "Multiple Daemons example" timeout-minutes: 30 run: cargo run --example multiple-daemons @@ -209,11 +212,11 @@ jobs: source /opt/ros/humble/setup.bash && ros2 run turtlesim turtlesim_node & source /opt/ros/humble/setup.bash && ros2 run examples_rclcpp_minimal_service service_main & cargo run --example rust-ros2-dataflow --features="ros2-examples" - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 if: runner.os != 'Windows' with: python-version: "3.8" - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 if: runner.os == 'Windows' with: python-version: "3.10" @@ -321,7 +324,7 @@ jobs: dora stop --name ci-rust-dynamic --grace-duration 5s dora destroy - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 with: # TODO: Support Python 3.13 when https://github.com/pytorch/pytorch/issues/130249 is fixed python-version: "3.12" @@ -339,35 +342,42 @@ jobs: # Test Python template Project dora new test_python_project --lang python --internal-create-with-path-dependencies cd test_python_project - uv venv --seed -p 3.11 + uv venv --seed -p 3.12 uv pip install -e ../apis/python/node - dora build dataflow.yml --uv uv pip install ruff pytest + echo "Running dora up" + dora up + echo "Running dora build" + dora build dataflow.yml --uv + # Check Compliancy uv run ruff check . uv run pytest export OPERATING_MODE=SAVE - dora up + echo "Running dora list" dora list dora build dataflow.yml --uv echo "Running CI Python Test" dora start dataflow.yml --name ci-python-test --detach --uv sleep 10 + echo "Running dora stop" dora stop --name ci-python-test --grace-duration 5s dora destroy + sleep 5 cd .. 
# Run Python Node Example echo "Running Python Node Example" dora up - uv venv --seed -p 3.11 + uv venv --seed -p 3.12 uv pip install -e apis/python/node dora build examples/python-dataflow/dataflow.yml --uv dora start examples/python-dataflow/dataflow.yml --name ci-python --detach --uv sleep 10 + echo "Running dora stop" dora stop --name ci-python --grace-duration 30s # Run Python Dynamic Node Example @@ -376,15 +386,18 @@ jobs: dora start examples/python-dataflow/dataflow_dynamic.yml --name ci-python-dynamic --detach --uv uv run opencv-plot --name plot sleep 10 + echo "Running dora stop" dora stop --name ci-python-dynamic --grace-duration 30s # Run Python Operator Example echo "Running CI Operator Test" dora start examples/python-operator-dataflow/dataflow.yml --name ci-python-operator --detach --uv sleep 10 + echo "Running dora stop" dora stop --name ci-python-operator --grace-duration 30s dora destroy + sleep 5 # Run Python queue latency test echo "Running CI Queue Latency Test" diff --git a/.github/workflows/pip-release.yml b/.github/workflows/pip-release.yml index 98465b85..711698fb 100644 --- a/.github/workflows/pip-release.yml +++ b/.github/workflows/pip-release.yml @@ -66,6 +66,7 @@ jobs: args: --release --out dist --zig manylinux: manylinux_2_28 working-directory: ${{ matrix.repository.path }} + before-script-linux: sudo apt-get install libatomic1-i386-cross libatomic1-armhf-cross && mkdir -p $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/i686-unknown-linux-gnu/lib/ && ln -s /usr/i686-linux-gnu/lib/libatomic.so.1 $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/i686-unknown-linux-gnu/lib/libatomic.so && ln -s /usr/i686-linux-gnu/lib/libatomic.so.1 $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/i686-unknown-linux-gnu/lib/libatomic.so.1 && ln -s /usr/i686-linux-gnu/lib/libatomic.so.1 /opt/hostedtoolcache/Python/3.8.18/x64/lib/libatomic.so.1 && mkdir -p $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/armv7-unknown-linux-gnueabihf/lib/ && ln -s /usr/arm-linux-gnueabihf/lib/libatomic.so.1 $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/armv7-unknown-linux-gnueabihf/lib/libatomic.so - name: Upload wheels if: github.event_name == 'release' uses: actions/upload-artifact@v4 diff --git a/.gitignore b/.gitignore index fcd517fe..d05ea4a4 100644 --- a/.gitignore +++ b/.gitignore @@ -35,7 +35,7 @@ __pycache__/ # Distribution / packaging .Python -build/ +/build/ develop-eggs/ dist/ downloads/ @@ -180,4 +180,4 @@ out/ #Miscellaneous yolo.yml -~* \ No newline at end of file +~* diff --git a/Cargo.lock b/Cargo.lock index e3bf55bf..c56d68ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -687,7 +687,7 @@ dependencies = [ "num-traits", "rusticata-macros", "thiserror 1.0.69", - "time 0.3.41", + "time", ] [[package]] @@ -1193,7 +1193,7 @@ dependencies = [ "pin-project-lite", "rustversion", "serde", - "sync_wrapper 1.0.2", + "sync_wrapper", "tower 0.5.2", "tower-layer", "tower-service", @@ -1214,7 +1214,7 @@ dependencies = [ "mime", "pin-project-lite", "rustversion", - "sync_wrapper 1.0.2", + "sync_wrapper", "tower-layer", "tower-service", ] @@ -1290,7 +1290,7 @@ dependencies = [ "path_abs", "plist", "regex", - "semver 1.0.26", + "semver", "serde", "serde_yaml 0.9.34+deprecated", "shell-words", @@ -1713,7 +1713,7 @@ checksum = "4acbb09d9ee8e23699b9634375c72795d095bf268439da88562cf9b501f181fa" dependencies = [ "camino", "cargo-platform", - "semver 1.0.26", + "semver", "serde", "serde_json", ] @@ -1726,7 +1726,7 
@@ checksum = "2d886547e41f740c616ae73108f6eb70afe6d940c7bc697cb30f13daec073037" dependencies = [ "camino", "cargo-platform", - "semver 1.0.26", + "semver", "serde", "serde_json", "thiserror 1.0.69", @@ -2455,6 +2455,33 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96a6ac251f4a2aca6b3f91340350eab87ae57c3f127ffeb585e92bd336717991" +[[package]] +name = "curve25519-dalek" +version = "4.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" +dependencies = [ + "cfg-if 1.0.0", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "rustc_version", + "subtle", + "zeroize", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.101", +] + [[package]] name = "cxx" version = "1.0.149" @@ -2911,14 +2938,17 @@ dependencies = [ "dora-operator-api-c", "dora-runtime", "dora-tracing", + "dunce", "duration-str", "env_logger 0.11.6", "eyre", "futures", + "git2", "inquire", "log", "notify 5.2.0", "pyo3", + "pyo3-build-config", "self-replace", "self_update", "serde", @@ -2929,6 +2959,7 @@ dependencies = [ "tokio", "tokio-stream", "tracing", + "tracing-log 0.2.0", "uuid 1.16.0", "webbrowser 0.8.15", ] @@ -2959,7 +2990,11 @@ name = "dora-core" version = "0.3.11" dependencies = [ "dora-message", + "dunce", "eyre", + "fs_extra", + "git2", + "itertools 0.14.0", "log", "once_cell", "schemars", @@ -2969,6 +3004,7 @@ dependencies = [ "serde_yaml 0.9.34+deprecated", "tokio", "tracing", + "url", "uuid 1.16.0", "which", ] @@ -2989,18 +3025,22 @@ dependencies = [ "dora-message", "dora-node-api", "dora-tracing", + "dunce", "eyre", "flume 0.10.14", "futures", "futures-concurrency", + "git2", + "itertools 0.14.0", "serde_json", - "serde_yaml 0.8.26", + "serde_yaml 0.9.34+deprecated", "shared-memory-server", "sysinfo 0.30.13", "tokio", "tokio-stream", "tracing", "tracing-opentelemetry", + "url", "uuid 1.16.0", "which", "zenoh 1.3.0", @@ -3025,7 +3065,7 @@ name = "dora-download" version = "0.3.11" dependencies = [ "eyre", - "reqwest 0.12.15", + "reqwest", "tokio", "tracing", ] @@ -3034,6 +3074,7 @@ dependencies = [ name = "dora-examples" version = "0.0.0" dependencies = [ + "dora-cli", "dora-coordinator", "dora-core", "dora-download", @@ -3065,7 +3106,7 @@ dependencies = [ [[package]] name = "dora-message" -version = "0.4.4" +version = "0.5.0-alpha" dependencies = [ "aligned-vec", "arrow-data", @@ -3075,7 +3116,7 @@ dependencies = [ "log", "once_cell", "schemars", - "semver 1.0.26", + "semver", "serde", "serde-with-expand-env", "serde_yaml 0.9.34+deprecated", @@ -3123,7 +3164,7 @@ dependencies = [ "futures-concurrency", "futures-timer", "serde_json", - "serde_yaml 0.8.26", + "serde_yaml 0.9.34+deprecated", "shared-memory-server", "shared_memory_extended", "tokio", @@ -3163,7 +3204,7 @@ name = "dora-node-api-python" version = "0.3.11" dependencies = [ "arrow", - "dora-daemon", + "dora-cli", "dora-download", "dora-node-api", "dora-operator-api-python", @@ -3173,8 +3214,9 @@ dependencies = [ "flume 0.10.14", "futures", "pyo3", + "pyo3-build-config", "pythonize", - "serde_yaml 0.8.26", + "serde_yaml 0.9.34+deprecated", "tokio", ] @@ -3255,7 +3297,7 @@ dependencies = [ "futures", "futures-concurrency", "pyo3", - "serde_yaml 0.8.26", + "serde_yaml 
0.9.34+deprecated", ] [[package]] @@ -3381,7 +3423,7 @@ dependencies = [ "libloading 0.7.4", "pyo3", "pythonize", - "serde_yaml 0.8.26", + "serde_yaml 0.9.34+deprecated", "tokio", "tokio-stream", "tracing", @@ -3446,7 +3488,7 @@ dependencies = [ "rust_decimal", "serde", "thiserror 1.0.69", - "time 0.3.41", + "time", ] [[package]] @@ -3483,6 +3525,31 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18aade80d5e09429040243ce1143ddc08a92d7a22820ac512610410a4dd5214f" +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "pkcs8 0.10.2", + "signature 2.2.0", +] + +[[package]] +name = "ed25519-dalek" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871" +dependencies = [ + "curve25519-dalek", + "ed25519", + "serde", + "sha2", + "signature 2.2.0", + "subtle", + "zeroize", +] + [[package]] name = "eframe" version = "0.31.1" @@ -4102,6 +4169,12 @@ dependencies = [ "anyhow", ] +[[package]] +name = "fiat-crypto" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" + [[package]] name = "filetime" version = "0.2.25" @@ -4257,6 +4330,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "fsevent-sys" version = "4.1.0" @@ -4679,6 +4758,8 @@ dependencies = [ "libc", "libgit2-sys", "log", + "openssl-probe", + "openssl-sys", "url", ] @@ -5073,12 +5154,12 @@ dependencies = [ "dirs 5.0.1", "futures", "http 1.3.1", - "indicatif 0.17.11", + "indicatif", "libc", "log", "num_cpus", "rand 0.8.5", - "reqwest 0.12.15", + "reqwest", "serde", "serde_json", "thiserror 2.0.12", @@ -5188,7 +5269,7 @@ dependencies = [ "http 1.3.1", "http-cache", "http-cache-semantics", - "reqwest 0.12.15", + "reqwest", "reqwest-middleware", "serde", "url", @@ -5203,7 +5284,7 @@ dependencies = [ "http 1.3.1", "http-serde", "serde", - "time 0.3.41", + "time", ] [[package]] @@ -5279,20 +5360,6 @@ dependencies = [ "want", ] -[[package]] -name = "hyper-rustls" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" -dependencies = [ - "futures-util", - "http 0.2.12", - "hyper 0.14.32", - "rustls 0.21.12", - "tokio", - "tokio-rustls 0.24.1", -] - [[package]] name = "hyper-rustls" version = "0.27.5" @@ -5306,7 +5373,7 @@ dependencies = [ "rustls 0.23.25", "rustls-pki-types", "tokio", - "tokio-rustls 0.26.2", + "tokio-rustls", "tower-service", "webpki-roots 0.26.8", ] @@ -5601,18 +5668,6 @@ dependencies = [ "serde", ] -[[package]] -name = "indicatif" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4" -dependencies = [ - "console", - "lazy_static", - "number_prefix 0.3.0", - "regex", -] - [[package]] name = "indicatif" version = "0.17.11" @@ -5620,7 +5675,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" 
dependencies = [ "console", - "number_prefix 0.4.0", + "number_prefix", "portable-atomic", "rayon", "unicode-width 0.2.0", @@ -6209,7 +6264,9 @@ checksum = "ee4126d8b4ee5c9d9ea891dd875cfdc1e9d0950437179104b183d7d8a74d24e8" dependencies = [ "cc", "libc", + "libssh2-sys", "libz-sys", + "openssl-sys", "pkg-config", ] @@ -6250,6 +6307,20 @@ dependencies = [ "redox_syscall 0.5.10", ] +[[package]] +name = "libssh2-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dc8a030b787e2119a731f1951d6a773e2280c660f8ec4b0f5e1505a386e71ee" +dependencies = [ + "cc", + "libc", + "libz-sys", + "openssl-sys", + "pkg-config", + "vcpkg", +] + [[package]] name = "libz-sys" version = "1.1.22" @@ -6756,7 +6827,7 @@ dependencies = [ "indexmap 2.8.0", "mistralrs-core", "rand 0.9.1", - "reqwest 0.12.15", + "reqwest", "serde", "serde_json", "tokio", @@ -6793,7 +6864,7 @@ dependencies = [ "hf-hub", "image", "indexmap 2.8.0", - "indicatif 0.17.11", + "indicatif", "interprocess", "itertools 0.13.0", "llguidance", @@ -6812,7 +6883,7 @@ dependencies = [ "rayon", "regex", "regex-automata 0.4.9", - "reqwest 0.12.15", + "reqwest", "rustc-hash 2.1.1", "safetensors", "schemars", @@ -7451,12 +7522,6 @@ dependencies = [ "libc", ] -[[package]] -name = "number_prefix" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a" - [[package]] name = "number_prefix" version = "0.4.0" @@ -7802,6 +7867,28 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" +[[package]] +name = "openssl-src" +version = "300.4.2+3.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168ce4e058f975fe43e89d9ccf78ca668601887ae736090aacc23ae353c298e2" +dependencies = [ + "cc", +] + +[[package]] +name = "openssl-sys" +version = "0.9.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb61ea9811cc39e3c2069f40b8b8e2e70d8569b361f879786cc7ed48b777cdd" +dependencies = [ + "cc", + "libc", + "openssl-src", + "pkg-config", + "vcpkg", +] + [[package]] name = "opentelemetry" version = "0.18.0" @@ -7836,7 +7923,7 @@ dependencies = [ "bytes", "http 1.3.1", "opentelemetry 0.29.1", - "reqwest 0.12.15", + "reqwest", "tracing", ] @@ -7870,7 +7957,7 @@ dependencies = [ "opentelemetry-proto", "opentelemetry_sdk 0.29.0", "prost", - "reqwest 0.12.15", + "reqwest", "thiserror 2.0.12", "tokio", "tonic", @@ -8403,7 +8490,7 @@ dependencies = [ "indexmap 2.8.0", "quick-xml 0.32.0", "serde", - "time 0.3.41", + "time", ] [[package]] @@ -8910,15 +8997,6 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" -[[package]] -name = "quick-xml" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26aab6b48e2590e4a64d1ed808749ba06257882b461d01ca71baeb747074a6dd" -dependencies = [ - "memchr", -] - [[package]] name = "quick-xml" version = "0.30.0" @@ -9316,7 +9394,7 @@ dependencies = [ "serde_json", "sha2", "thiserror 1.0.69", - "time 0.3.41", + "time", "url", "uuid 1.16.0", "web-sys", @@ -9378,7 +9456,7 @@ dependencies = [ "cargo_metadata 0.18.1", "glob", "sha2", - "time 0.3.41", + "time", "unindent", "walkdir", ] @@ -10408,7 +10486,7 @@ dependencies = [ "strum 0.26.3", "strum_macros 0.26.4", "sublime_fuzzy", - "time 
0.3.41", + "time", "url", ] @@ -11063,47 +11141,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b30a45b0cd0bcca8037f3d0dc3421eaf95327a17cad11964fb8179b4fc4832" -[[package]] -name = "reqwest" -version = "0.11.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62" -dependencies = [ - "base64 0.21.7", - "bytes", - "encoding_rs", - "futures-core", - "futures-util", - "h2 0.3.26", - "http 0.2.12", - "http-body 0.4.6", - "hyper 0.14.32", - "hyper-rustls 0.24.2", - "ipnet", - "js-sys", - "log", - "mime", - "once_cell", - "percent-encoding", - "pin-project-lite", - "rustls 0.21.12", - "rustls-pemfile 1.0.4", - "serde", - "serde_json", - "serde_urlencoded", - "sync_wrapper 0.1.2", - "system-configuration 0.5.1", - "tokio", - "tokio-rustls 0.24.1", - "tower-service", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", - "webpki-roots 0.25.4", - "winreg", -] - [[package]] name = "reqwest" version = "0.12.15" @@ -11121,7 +11158,7 @@ dependencies = [ "http-body 1.0.1", "http-body-util", "hyper 1.6.0", - "hyper-rustls 0.27.5", + "hyper-rustls", "hyper-util", "ipnet", "js-sys", @@ -11137,10 +11174,10 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "sync_wrapper 1.0.2", - "system-configuration 0.6.1", + "sync_wrapper", + "system-configuration", "tokio", - "tokio-rustls 0.26.2", + "tokio-rustls", "tokio-util", "tower 0.5.2", "tower-service", @@ -11162,7 +11199,7 @@ dependencies = [ "anyhow", "async-trait", "http 1.3.1", - "reqwest 0.12.15", + "reqwest", "serde", "thiserror 1.0.69", "tower-service", @@ -11181,7 +11218,7 @@ dependencies = [ "document-features", "env_filter", "indexmap 2.8.0", - "indicatif 0.17.11", + "indicatif", "itertools 0.14.0", "log", "puffin", @@ -11512,7 +11549,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ - "semver 1.0.26", + "semver", ] [[package]] @@ -11611,18 +11648,6 @@ dependencies = [ "webpki", ] -[[package]] -name = "rustls" -version = "0.21.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" -dependencies = [ - "log", - "ring 0.17.14", - "rustls-webpki 0.101.7", - "sct", -] - [[package]] name = "rustls" version = "0.23.25" @@ -11716,16 +11741,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" -[[package]] -name = "rustls-webpki" -version = "0.101.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" -dependencies = [ - "ring 0.17.14", - "untrusted 0.9.0", -] - [[package]] name = "rustls-webpki" version = "0.102.8" @@ -11975,32 +11990,26 @@ dependencies = [ [[package]] name = "self_update" -version = "0.27.0" +version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb85f1802f7b987237b8525c0fde86ea86f31c957c1875467c727d5b921179c" +checksum = "d832c086ece0dacc29fb2947bb4219b8f6e12fe9e40b7108f9e57c4224e47b5c" dependencies = [ "either", "flate2", - "hyper 0.14.32", - "indicatif 0.15.0", + "hyper 1.6.0", + "indicatif", "log", - "quick-xml 0.20.0", + "quick-xml 0.37.2", "regex", - "reqwest 0.11.27", - "semver 0.11.0", + "reqwest", 
+ "self-replace", + "semver", "serde_json", "tar", "tempfile", - "zip 0.5.13", -] - -[[package]] -name = "semver" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" -dependencies = [ - "semver-parser", + "urlencoding", + "zip 2.4.2", + "zipsign-api", ] [[package]] @@ -12012,15 +12021,6 @@ dependencies = [ "serde", ] -[[package]] -name = "semver-parser" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9900206b54a3527fdc7b8a938bffd94a568bac4f4aa8113b209df75a09c0dec2" -dependencies = [ - "pest", -] - [[package]] name = "seq-macro" version = "0.3.6" @@ -12187,7 +12187,7 @@ dependencies = [ "serde_derive", "serde_json", "serde_with_macros", - "time 0.3.41", + "time", ] [[package]] @@ -12985,12 +12985,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "sync_wrapper" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" - [[package]] name = "sync_wrapper" version = "1.0.2" @@ -13089,17 +13083,6 @@ dependencies = [ "windows 0.57.0", ] -[[package]] -name = "system-configuration" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" -dependencies = [ - "bitflags 1.3.2", - "core-foundation 0.9.4", - "system-configuration-sys 0.5.0", -] - [[package]] name = "system-configuration" version = "0.6.1" @@ -13108,17 +13091,7 @@ checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" dependencies = [ "bitflags 2.9.0", "core-foundation 0.9.4", - "system-configuration-sys 0.6.0", -] - -[[package]] -name = "system-configuration-sys" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" -dependencies = [ - "core-foundation-sys", - "libc", + "system-configuration-sys", ] [[package]] @@ -13334,17 +13307,6 @@ dependencies = [ "weezl", ] -[[package]] -name = "time" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi 0.3.9", -] - [[package]] name = "time" version = "0.3.41" @@ -13470,7 +13432,7 @@ dependencies = [ "pin-project-lite", "thiserror 2.0.12", "tokio", - "tokio-rustls 0.26.2", + "tokio-rustls", ] [[package]] @@ -13507,7 +13469,7 @@ dependencies = [ "derive_builder", "esaxx-rs", "getrandom 0.2.15", - "indicatif 0.17.11", + "indicatif", "itertools 0.13.0", "lazy_static", "log", @@ -13568,16 +13530,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-rustls" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" -dependencies = [ - "rustls 0.21.12", - "tokio", -] - [[package]] name = "tokio-rustls" version = "0.26.2" @@ -13712,7 +13664,7 @@ dependencies = [ "rustls-pemfile 2.2.0", "socket2 0.5.8", "tokio", - "tokio-rustls 0.26.2", + "tokio-rustls", "tokio-stream", "tower 0.4.13", "tower-layer", @@ -13794,7 +13746,7 @@ dependencies = [ "futures-core", "futures-util", "pin-project-lite", - "sync_wrapper 1.0.2", + "sync_wrapper", "tokio", "tower-layer", "tower-service", @@ -14444,7 +14396,7 @@ 
dependencies = [ "image", "log", "lru", - "reqwest 0.12.15", + "reqwest", "reqwest-middleware", "thiserror 2.0.12", "tokio", @@ -14460,12 +14412,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -14756,12 +14702,6 @@ dependencies = [ "webpki", ] -[[package]] -name = "webpki-roots" -version = "0.25.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" - [[package]] name = "webpki-roots" version = "0.26.8" @@ -15635,16 +15575,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "winreg" -version = "0.50.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" -dependencies = [ - "cfg-if 1.0.0", - "windows-sys 0.48.0", -] - [[package]] name = "wit-bindgen-rt" version = "0.39.0" @@ -15754,7 +15684,7 @@ dependencies = [ "oid-registry", "rusticata-macros", "thiserror 1.0.69", - "time 0.3.41", + "time", ] [[package]] @@ -16372,7 +16302,7 @@ dependencies = [ "rustls 0.23.25", "rustls-webpki 0.102.8", "serde", - "time 0.3.41", + "time", "tokio", "tokio-util", "tracing", @@ -16423,7 +16353,7 @@ dependencies = [ "rustls-pki-types", "rustls-webpki 0.102.8", "secrecy", - "time 0.3.41", + "time", "tokio", "tokio-util", "tracing", @@ -16508,10 +16438,10 @@ dependencies = [ "rustls-webpki 0.102.8", "secrecy", "socket2 0.5.8", - "time 0.3.41", + "time", "tls-listener", "tokio", - "tokio-rustls 0.26.2", + "tokio-rustls", "tokio-util", "tracing", "webpki-roots 0.26.8", @@ -16994,29 +16924,44 @@ dependencies = [ [[package]] name = "zip" -version = "0.5.13" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93ab48844d61251bb3835145c521d88aa4031d7139e8485990f60ca911fa0815" +checksum = "9cc23c04387f4da0374be4533ad1208cbb091d5c11d070dfef13676ad6497164" dependencies = [ - "byteorder", + "arbitrary", "crc32fast", + "crossbeam-utils", + "displaydoc", + "indexmap 2.8.0", + "num_enum", "thiserror 1.0.69", - "time 0.1.45", ] [[package]] name = "zip" -version = "1.1.4" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cc23c04387f4da0374be4533ad1208cbb091d5c11d070dfef13676ad6497164" +checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" dependencies = [ "arbitrary", "crc32fast", "crossbeam-utils", "displaydoc", "indexmap 2.8.0", - "num_enum", - "thiserror 1.0.69", + "memchr", + "thiserror 2.0.12", + "time", +] + +[[package]] +name = "zipsign-api" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dba6063ff82cdbd9a765add16d369abe81e520f836054e997c2db217ceca40c0" +dependencies = [ + "base64 0.22.1", + "ed25519-dalek", + "thiserror 2.0.12", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 93f68c33..5080b5a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -72,6 +72,7 @@ dora-metrics = { version = "0.3.11", path = "libraries/extensions/telemetry/metr dora-download = { version = "0.3.11", path = "libraries/extensions/download" } shared-memory-server = { version = "0.3.11", path = "libraries/shared-memory-server" } communication-layer-request-reply = { version = "0.3.11", path = 
"libraries/communication-layer/request-reply" } +dora-cli = { version = "0.3.11", path = "binaries/cli" } dora-runtime = { version = "0.3.11", path = "binaries/runtime" } dora-daemon = { version = "0.3.11", path = "binaries/daemon" } dora-coordinator = { version = "0.3.11", path = "binaries/coordinator" } @@ -79,7 +80,7 @@ dora-ros2-bridge = { version = "0.3.11", path = "libraries/extensions/ros2-bridg dora-ros2-bridge-msg-gen = { version = "0.3.11", path = "libraries/extensions/ros2-bridge/msg-gen" } dora-ros2-bridge-python = { path = "libraries/extensions/ros2-bridge/python" } # versioned independently from the other dora crates -dora-message = { version = "0.4.4", path = "libraries/message" } +dora-message = { version = "0.5.0-alpha", path = "libraries/message" } arrow = { version = "54.2.1" } arrow-schema = { version = "54.2.1" } arrow-data = { version = "54.2.1" } @@ -91,6 +92,8 @@ pyo3 = { version = "0.23", features = [ "multiple-pymethods", ] } pythonize = "0.23" +git2 = { version = "0.18.0", features = ["vendored-openssl"] } +serde_yaml = "0.9.33" [package] name = "dora-examples" @@ -107,6 +110,7 @@ ros2-examples = [] [dev-dependencies] eyre = "0.6.8" tokio = "1.24.2" +dora-cli = { workspace = true } dora-coordinator = { workspace = true } dora-core = { workspace = true } dora-message = { workspace = true } @@ -135,6 +139,10 @@ path = "examples/vlm/run.rs" name = "rust-dataflow" path = "examples/rust-dataflow/run.rs" +[[example]] +name = "rust-dataflow-git" +path = "examples/rust-dataflow-git/run.rs" + [[example]] name = "rust-ros2-dataflow" path = "examples/rust-ros2-dataflow/run.rs" diff --git a/apis/python/node/Cargo.toml b/apis/python/node/Cargo.toml index 54ebff5c..f03a4036 100644 --- a/apis/python/node/Cargo.toml +++ b/apis/python/node/Cargo.toml @@ -21,10 +21,10 @@ dora-node-api = { workspace = true } dora-operator-api-python = { workspace = true } pyo3.workspace = true eyre = "0.6" -serde_yaml = "0.8.23" +serde_yaml = { workspace = true } flume = "0.10.14" dora-runtime = { workspace = true, features = ["tracing", "metrics", "python"] } -dora-daemon = { workspace = true } +dora-cli = { workspace = true } dora-download = { workspace = true } arrow = { workspace = true, features = ["pyarrow"] } pythonize = { workspace = true } @@ -33,6 +33,9 @@ dora-ros2-bridge-python = { workspace = true } # pyo3_special_method_derive = "0.4.2" tokio = { version = "1.24.2", features = ["rt"] } +[build-dependencies] +pyo3-build-config = "0.23" + [lib] name = "dora" crate-type = ["cdylib"] diff --git a/apis/python/node/build.rs b/apis/python/node/build.rs new file mode 100644 index 00000000..dace4a9b --- /dev/null +++ b/apis/python/node/build.rs @@ -0,0 +1,3 @@ +fn main() { + pyo3_build_config::add_extension_module_link_args(); +} diff --git a/apis/python/node/pyproject.toml b/apis/python/node/pyproject.toml index 33048a3f..8636df49 100644 --- a/apis/python/node/pyproject.toml +++ b/apis/python/node/pyproject.toml @@ -22,3 +22,11 @@ extend-select = [ "D", # pydocstyle "UP", ] + +[tool.maturin.target.x86_64-apple-darwin] +# macOS deployment target SDK version +macos-deployment-target = "14.5" + +[tool.maturin.target.aarch64-apple-darwin] +# macOS deployment target SDK version +macos-deployment-target = "14.5" diff --git a/apis/python/node/src/lib.rs b/apis/python/node/src/lib.rs index e2a249a9..18e70c3e 100644 --- a/apis/python/node/src/lib.rs +++ b/apis/python/node/src/lib.rs @@ -6,7 +6,6 @@ use std::sync::Arc; use std::time::Duration; use arrow::pyarrow::{FromPyArrow, ToPyArrow}; -use 
dora_daemon::Daemon; use dora_download::download_file; use dora_node_api::dora_core::config::NodeId; use dora_node_api::dora_core::descriptor::source_is_url; @@ -231,7 +230,7 @@ impl Node { /// :rtype: dict pub fn dataflow_descriptor(&mut self, py: Python) -> eyre::Result<Py<PyAny>> { Ok( - pythonize::pythonize(py, &self.node.get_mut().dataflow_descriptor()) + pythonize::pythonize(py, &self.node.get_mut().dataflow_descriptor()?) .map(|x| x.unbind())?, ) } @@ -382,19 +381,7 @@ pub fn resolve_dataflow(dataflow: String) -> eyre::Result<PathBuf> { #[pyfunction] #[pyo3(signature = (dataflow_path, uv=None))] pub fn run(dataflow_path: String, uv: Option<bool>) -> eyre::Result<()> { - let dataflow_path = resolve_dataflow(dataflow_path).context("could not resolve dataflow")?; - let rt = tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build() - .context("tokio runtime failed")?; - let result = rt.block_on(Daemon::run_dataflow(&dataflow_path, uv.unwrap_or_default()))?; - match result.is_ok() { - true => Ok(()), - false => Err(eyre::eyre!( - "Dataflow failed to run with error: {:?}", - result.node_results - )), - } + dora_cli::command::run(dataflow_path, uv.unwrap_or_default()) } #[pymodule] diff --git a/apis/python/operator/Cargo.toml b/apis/python/operator/Cargo.toml index a96c5987..a65a929d 100644 --- a/apis/python/operator/Cargo.toml +++ b/apis/python/operator/Cargo.toml @@ -14,7 +14,7 @@ repository.workspace = true dora-node-api = { workspace = true } pyo3 = { workspace = true, features = ["eyre", "abi3-py37"] } eyre = "0.6" -serde_yaml = "0.8.23" +serde_yaml = { workspace = true } flume = "0.10.14" arrow = { workspace = true, features = ["pyarrow"] } arrow-schema = { workspace = true } diff --git a/apis/rust/node/Cargo.toml b/apis/rust/node/Cargo.toml index a96256f0..d1485b4b 100644 --- a/apis/rust/node/Cargo.toml +++ b/apis/rust/node/Cargo.toml @@ -17,7 +17,7 @@ dora-core = { workspace = true } dora-message = { workspace = true } shared-memory-server = { workspace = true } eyre = "0.6.7" -serde_yaml = "0.8.23" +serde_yaml = { workspace = true } tracing = "0.1.33" flume = "0.10.14" bincode = "1.3.3" diff --git a/apis/rust/node/src/node/mod.rs b/apis/rust/node/src/node/mod.rs index 47890d46..7b4a109c 100644 --- a/apis/rust/node/src/node/mod.rs +++ b/apis/rust/node/src/node/mod.rs @@ -60,7 +60,7 @@ pub struct DoraNode { drop_stream: DropStream, cache: VecDeque, - dataflow_descriptor: Descriptor, + dataflow_descriptor: serde_yaml::Result<Descriptor>, warned_unknown_output: BTreeSet, _rt: TokioRuntime, } @@ -158,10 +158,9 @@ impl DoraNode { ), }; - let id = format!("{}/{}", dataflow_id, node_id); - #[cfg(feature = "metrics")] { + let id = format!("{}/{}", dataflow_id, node_id); let monitor_task = async move { if let Err(e) = run_metrics_monitor(id.clone()) .await @@ -200,7 +199,7 @@ impl DoraNode { sent_out_shared_memory: HashMap::new(), drop_stream, cache: VecDeque::new(), - dataflow_descriptor, + dataflow_descriptor: serde_yaml::from_value(dataflow_descriptor), warned_unknown_output: BTreeSet::new(), _rt: rt, }; @@ -449,8 +448,15 @@ impl DoraNode { /// Returns the full dataflow descriptor that this node is part of. /// /// This method returns the parsed dataflow YAML file.
- pub fn dataflow_descriptor(&self) -> &Descriptor { - &self.dataflow_descriptor + pub fn dataflow_descriptor(&self) -> eyre::Result<&Descriptor> { + match &self.dataflow_descriptor { + Ok(d) => Ok(d), + Err(err) => eyre::bail!( + "failed to parse dataflow descriptor: {err}\n\n + This might be caused by mismatched version numbers of dora \ + daemon and the dora node API" + ), + } } } diff --git a/binaries/cli/Cargo.toml b/binaries/cli/Cargo.toml index 6fd7e1ce..7aa97db6 100644 --- a/binaries/cli/Cargo.toml +++ b/binaries/cli/Cargo.toml @@ -27,7 +27,7 @@ dora-node-api-c = { workspace = true } dora-operator-api-c = { workspace = true } dora-download = { workspace = true } serde = { version = "1.0.136", features = ["derive"] } -serde_yaml = "0.9.11" +serde_yaml = { workspace = true } webbrowser = "0.8.3" serde_json = "1.0.86" termcolor = "1.1.3" @@ -37,6 +37,7 @@ communication-layer-request-reply = { workspace = true } notify = "5.1.0" ctrlc = "3.2.5" tracing = "0.1.36" +tracing-log = "0.2.0" dora-tracing = { workspace = true, optional = true } bat = "0.24.0" dora-daemon = { workspace = true } @@ -50,7 +51,7 @@ tabwriter = "1.4.0" log = { version = "0.4.21", features = ["serde"] } colored = "2.1.0" env_logger = "0.11.3" -self_update = { version = "0.27.0", features = [ +self_update = { version = "0.42.0", features = [ "rustls", "archive-zip", "archive-tar", @@ -61,7 +62,11 @@ pyo3 = { workspace = true, features = [ "abi3", ], optional = true } self-replace = "1.5.0" +dunce = "1.0.5" +git2 = { workspace = true } +[build-dependencies] +pyo3-build-config = "0.23" [lib] name = "dora_cli" diff --git a/binaries/cli/build.rs b/binaries/cli/build.rs index 81caa36d..3672c16f 100644 --- a/binaries/cli/build.rs +++ b/binaries/cli/build.rs @@ -1,4 +1,5 @@ fn main() { + pyo3_build_config::add_extension_module_link_args(); println!( "cargo:rustc-env=TARGET={}", std::env::var("TARGET").unwrap() diff --git a/binaries/cli/pyproject.toml b/binaries/cli/pyproject.toml index 1ef4af39..c2d52457 100644 --- a/binaries/cli/pyproject.toml +++ b/binaries/cli/pyproject.toml @@ -15,6 +15,14 @@ features = ["python", "pyo3/extension-module"] [tool.ruff.lint] extend-select = [ - "D", # pydocstyle - "UP" + "D", # pydocstyle + "UP", ] + +[tool.maturin.target.x86_64-apple-darwin] +# macOS deployment target SDK version +macos-deployment-target = "14.5" + +[tool.maturin.target.aarch64-apple-darwin] +# macOS deployment target SDK version +macos-deployment-target = "14.5" diff --git a/binaries/cli/src/commands/daemon.rs b/binaries/cli/src/commands/daemon.rs index a6350ea3..c4aa6ca7 100644 --- a/binaries/cli/src/commands/daemon.rs +++ b/binaries/cli/src/commands/daemon.rs @@ -1,9 +1,10 @@ use super::Executable; -use crate::common::handle_dataflow_result; +use crate::{common::handle_dataflow_result, session::DataflowSession}; use dora_core::topics::{ DORA_COORDINATOR_PORT_DEFAULT, DORA_DAEMON_LOCAL_LISTEN_PORT_DEFAULT, LOCALHOST, }; +use dora_daemon::LogDestination; #[cfg(feature = "tracing")] use dora_tracing::TracingBuilder; @@ -62,24 +63,29 @@ impl Executable for Daemon { .build() .context("tokio runtime failed")?; rt.block_on(async { - match self.run_dataflow { - Some(dataflow_path) => { - tracing::info!("Starting dataflow `{}`", dataflow_path.display()); - if self.coordinator_addr != LOCALHOST { - tracing::info!( - "Not using coordinator addr {} as `run_dataflow` is for local dataflow only. 
Please use the `start` command for remote coordinator", - self.coordinator_addr - ); - } + match self.run_dataflow { + Some(dataflow_path) => { + tracing::info!("Starting dataflow `{}`", dataflow_path.display()); + if self.coordinator_addr != LOCALHOST { + tracing::info!( + "Not using coordinator addr {} as `run_dataflow` is for local dataflow only. Please use the `start` command for remote coordinator", + self.coordinator_addr + ); + } + let dataflow_session = + DataflowSession::read_session(&dataflow_path).context("failed to read DataflowSession")?; - let result = dora_daemon::Daemon::run_dataflow(&dataflow_path, false).await?; - handle_dataflow_result(result, None) - } - None => { - dora_daemon::Daemon::run(SocketAddr::new(self.coordinator_addr, self.coordinator_port), self.machine_id, self.local_listen_port).await + let result = dora_daemon::Daemon::run_dataflow(&dataflow_path, + dataflow_session.build_id, dataflow_session.local_build, dataflow_session.session_id, false, + LogDestination::Tracing, + ).await?; + handle_dataflow_result(result, None) + } + None => { + dora_daemon::Daemon::run(SocketAddr::new(self.coordinator_addr, self.coordinator_port), self.machine_id, self.local_listen_port).await + } } - } - }) - .context("failed to run dora-daemon") + }) + .context("failed to run dora-daemon") } } diff --git a/binaries/cli/src/commands/run.rs b/binaries/cli/src/commands/run.rs index e35baeaa..50cfd78a 100644 --- a/binaries/cli/src/commands/run.rs +++ b/binaries/cli/src/commands/run.rs @@ -1,6 +1,10 @@ use super::Executable; -use crate::common::{handle_dataflow_result, resolve_dataflow}; -use dora_daemon::Daemon; +use crate::{ + common::{handle_dataflow_result, resolve_dataflow}, + output::print_log_message, + session::DataflowSession, +}; +use dora_daemon::{flume, Daemon, LogDestination}; use dora_tracing::TracingBuilder; use eyre::Context; use tokio::runtime::Builder; @@ -32,11 +36,28 @@ impl Executable for Run { let dataflow_path = resolve_dataflow(self.dataflow).context("could not resolve dataflow")?; + let dataflow_session = DataflowSession::read_session(&dataflow_path) + .context("failed to read DataflowSession")?; let rt = Builder::new_multi_thread() .enable_all() .build() .context("tokio runtime failed")?; - let result = rt.block_on(Daemon::run_dataflow(&dataflow_path, self.uv))?; + + let (log_tx, log_rx) = flume::bounded(100); + std::thread::spawn(move || { + for message in log_rx { + print_log_message(message, false, false); + } + }); + + let result = rt.block_on(Daemon::run_dataflow( + &dataflow_path, + dataflow_session.build_id, + dataflow_session.local_build, + dataflow_session.session_id, + self.uv, + LogDestination::Channel { sender: log_tx }, + ))?; handle_dataflow_result(result, None) } } diff --git a/binaries/cli/src/commands/start.rs b/binaries/cli/src/commands/start.rs deleted file mode 100644 index fdee8709..00000000 --- a/binaries/cli/src/commands/start.rs +++ /dev/null @@ -1,127 +0,0 @@ -use super::{default_tracing, Executable}; -use crate::{ - attach::attach_dataflow, - common::{connect_to_coordinator, resolve_dataflow}, -}; -use communication_layer_request_reply::TcpRequestReplyConnection; -use dora_core::{ - descriptor::{Descriptor, DescriptorExt}, - topics::{DORA_COORDINATOR_PORT_CONTROL_DEFAULT, LOCALHOST}, -}; -use dora_message::{cli_to_coordinator::ControlRequest, coordinator_to_cli::ControlRequestReply}; -use eyre::{bail, Context}; -use std::{net::IpAddr, path::PathBuf}; -use uuid::Uuid; - -#[derive(Debug, clap::Args)] -/// Start the given dataflow path. 
Attach a name to the running dataflow by using --name. -pub struct Start { - /// Path to the dataflow descriptor file - #[clap(value_name = "PATH")] - dataflow: String, - /// Assign a name to the dataflow - #[clap(long)] - name: Option, - /// Address of the dora coordinator - #[clap(long, value_name = "IP", default_value_t = LOCALHOST)] - coordinator_addr: IpAddr, - /// Port number of the coordinator control server - #[clap(long, value_name = "PORT", default_value_t = DORA_COORDINATOR_PORT_CONTROL_DEFAULT)] - coordinator_port: u16, - /// Attach to the dataflow and wait for its completion - #[clap(long, action)] - attach: bool, - /// Run the dataflow in background - #[clap(long, action)] - detach: bool, - /// Enable hot reloading (Python only) - #[clap(long, action)] - hot_reload: bool, - // Use UV to run nodes. - #[clap(long, action)] - uv: bool, -} - -impl Executable for Start { - fn execute(self) -> eyre::Result<()> { - default_tracing()?; - let dataflow = resolve_dataflow(self.dataflow).context("could not resolve dataflow")?; - let dataflow_descriptor = - Descriptor::blocking_read(&dataflow).wrap_err("Failed to read yaml dataflow")?; - let working_dir = dataflow - .canonicalize() - .context("failed to canonicalize dataflow path")? - .parent() - .ok_or_else(|| eyre::eyre!("dataflow path has no parent dir"))? - .to_owned(); - - let coordinator_socket = (self.coordinator_addr, self.coordinator_port).into(); - let mut session = connect_to_coordinator(coordinator_socket) - .wrap_err("failed to connect to dora coordinator")?; - let dataflow_id = start_dataflow( - dataflow_descriptor.clone(), - self.name, - working_dir, - &mut *session, - self.uv, - )?; - - let attach = match (self.attach, self.detach) { - (true, true) => eyre::bail!("both `--attach` and `--detach` are given"), - (true, false) => true, - (false, true) => false, - (false, false) => { - println!("attaching to dataflow (use `--detach` to run in background)"); - true - } - }; - - if attach { - attach_dataflow( - dataflow_descriptor, - dataflow, - dataflow_id, - &mut *session, - self.hot_reload, - coordinator_socket, - env_logger::Builder::new() - .filter_level(log::LevelFilter::Info) - .parse_default_env() - .build() - .filter(), - )?; - } - Ok(()) - } -} - -fn start_dataflow( - dataflow: Descriptor, - name: Option, - local_working_dir: PathBuf, - session: &mut TcpRequestReplyConnection, - uv: bool, -) -> Result { - let reply_raw = session - .request( - &serde_json::to_vec(&ControlRequest::Start { - dataflow, - name, - local_working_dir, - uv, - }) - .unwrap(), - ) - .wrap_err("failed to send start dataflow message")?; - - let result: ControlRequestReply = - serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; - match result { - ControlRequestReply::DataflowStarted { uuid } => { - eprintln!("{uuid}"); - Ok(uuid) - } - ControlRequestReply::Error(err) => bail!("{err}"), - other => bail!("unexpected start dataflow reply: {other:?}"), - } -} diff --git a/binaries/cli/src/attach.rs b/binaries/cli/src/commands/start/attach.rs similarity index 82% rename from binaries/cli/src/attach.rs rename to binaries/cli/src/commands/start/attach.rs index ee5e441f..bdca8bfc 100644 --- a/binaries/cli/src/attach.rs +++ b/binaries/cli/src/commands/start/attach.rs @@ -1,4 +1,3 @@ -use colored::Colorize; use communication_layer_request_reply::{TcpConnection, TcpRequestReplyConnection}; use dora_core::descriptor::{resolve_path, CoreNodeKind, Descriptor, DescriptorExt}; use dora_message::cli_to_coordinator::ControlRequest; @@ -16,6 +15,7 
@@ use tracing::{error, info}; use uuid::Uuid; use crate::common::handle_dataflow_result; +use crate::output::print_log_message; pub fn attach_dataflow( dataflow: Descriptor, @@ -33,6 +33,8 @@ pub fn attach_dataflow( let nodes = dataflow.resolve_aliases_and_set_defaults()?; + let print_daemon_name = nodes.values().any(|n| n.deploy.is_some()); + let working_dir = dataflow_path .canonicalize() .context("failed to canonicalize dataflow path")? @@ -155,39 +157,7 @@ pub fn attach_dataflow( }, Ok(AttachEvent::Control(control_request)) => control_request, Ok(AttachEvent::Log(Ok(log_message))) => { - let LogMessage { - dataflow_id, - node_id, - daemon_id, - level, - target, - module_path: _, - file: _, - line: _, - message, - } = log_message; - let level = match level { - log::Level::Error => "ERROR".red(), - log::Level::Warn => "WARN ".yellow(), - log::Level::Info => "INFO ".green(), - other => format!("{other:5}").normal(), - }; - let dataflow = format!(" dataflow `{dataflow_id}`").cyan(); - let daemon = match daemon_id { - Some(id) => format!(" on daemon `{id}`"), - None => " on default daemon".to_string(), - } - .bright_black(); - let node = match node_id { - Some(node_id) => format!(" {node_id}").bold(), - None => "".normal(), - }; - let target = match target { - Some(target) => format!(" {target}").dimmed(), - None => "".normal(), - }; - - println!("{level}{dataflow}{daemon}{node}{target}: {message}"); + print_log_message(log_message, false, print_daemon_name); continue; } Ok(AttachEvent::Log(Err(err))) => { @@ -202,7 +172,7 @@ pub fn attach_dataflow( let result: ControlRequestReply = serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; match result { - ControlRequestReply::DataflowStarted { uuid: _ } => (), + ControlRequestReply::DataflowSpawned { uuid: _ } => (), ControlRequestReply::DataflowStopped { uuid, result } => { info!("dataflow {uuid} stopped"); break handle_dataflow_result(result, Some(uuid)); diff --git a/binaries/cli/src/commands/start/mod.rs b/binaries/cli/src/commands/start/mod.rs new file mode 100644 index 00000000..bdc9f6d0 --- /dev/null +++ b/binaries/cli/src/commands/start/mod.rs @@ -0,0 +1,200 @@ +use super::{default_tracing, Executable}; +use crate::{ + commands::start::attach::attach_dataflow, + common::{connect_to_coordinator, local_working_dir, resolve_dataflow}, + output::print_log_message, + session::DataflowSession, +}; +use communication_layer_request_reply::{TcpConnection, TcpRequestReplyConnection}; +use dora_core::{ + descriptor::{Descriptor, DescriptorExt}, + topics::{DORA_COORDINATOR_PORT_CONTROL_DEFAULT, LOCALHOST}, +}; +use dora_message::{ + cli_to_coordinator::ControlRequest, common::LogMessage, coordinator_to_cli::ControlRequestReply, +}; +use eyre::{bail, Context}; +use std::{ + net::{IpAddr, SocketAddr, TcpStream}, + path::PathBuf, +}; +use uuid::Uuid; + +mod attach; + +#[derive(Debug, clap::Args)] +/// Start the given dataflow path. Attach a name to the running dataflow by using --name. 
+pub struct Start { + /// Path to the dataflow descriptor file + #[clap(value_name = "PATH")] + dataflow: String, + /// Assign a name to the dataflow + #[clap(long)] + name: Option<String>, + /// Address of the dora coordinator + #[clap(long, value_name = "IP", default_value_t = LOCALHOST)] + coordinator_addr: IpAddr, + /// Port number of the coordinator control server + #[clap(long, value_name = "PORT", default_value_t = DORA_COORDINATOR_PORT_CONTROL_DEFAULT)] + coordinator_port: u16, + /// Attach to the dataflow and wait for its completion + #[clap(long, action)] + attach: bool, + /// Run the dataflow in background + #[clap(long, action)] + detach: bool, + /// Enable hot reloading (Python only) + #[clap(long, action)] + hot_reload: bool, + // Use UV to run nodes. + #[clap(long, action)] + uv: bool, +} + +impl Executable for Start { + fn execute(self) -> eyre::Result<()> { + default_tracing()?; + let coordinator_socket = (self.coordinator_addr, self.coordinator_port).into(); + + let (dataflow, dataflow_descriptor, mut session, dataflow_id) = + start_dataflow(self.dataflow, self.name, coordinator_socket, self.uv)?; + + let attach = match (self.attach, self.detach) { + (true, true) => eyre::bail!("both `--attach` and `--detach` are given"), + (true, false) => true, + (false, true) => false, + (false, false) => { + println!("attaching to dataflow (use `--detach` to run in background)"); + true + } + }; + + if attach { + let log_level = env_logger::Builder::new() + .filter_level(log::LevelFilter::Info) + .parse_default_env() + .build() + .filter(); + + attach_dataflow( + dataflow_descriptor, + dataflow, + dataflow_id, + &mut *session, + self.hot_reload, + coordinator_socket, + log_level, + ) + } else { + let print_daemon_name = dataflow_descriptor.nodes.iter().any(|n| n.deploy.is_some()); + // wait until dataflow is started + wait_until_dataflow_started( + dataflow_id, + &mut session, + coordinator_socket, + log::LevelFilter::Info, + print_daemon_name, + ) + } + } +} + +fn start_dataflow( + dataflow: String, + name: Option<String>, + coordinator_socket: SocketAddr, + uv: bool, +) -> Result<(PathBuf, Descriptor, Box<TcpRequestReplyConnection>, Uuid), eyre::Error> { + let dataflow = resolve_dataflow(dataflow).context("could not resolve dataflow")?; + let dataflow_descriptor = + Descriptor::blocking_read(&dataflow).wrap_err("Failed to read yaml dataflow")?; + let dataflow_session = + DataflowSession::read_session(&dataflow).context("failed to read DataflowSession")?; + + let mut session = connect_to_coordinator(coordinator_socket) + .wrap_err("failed to connect to dora coordinator")?; + + let local_working_dir = local_working_dir(&dataflow, &dataflow_descriptor, &mut *session)?; + + let dataflow_id = { + let dataflow = dataflow_descriptor.clone(); + let session: &mut TcpRequestReplyConnection = &mut *session; + let reply_raw = session + .request( + &serde_json::to_vec(&ControlRequest::Start { + build_id: dataflow_session.build_id, + session_id: dataflow_session.session_id, + dataflow, + name, + local_working_dir, + uv, + }) + .unwrap(), + ) + .wrap_err("failed to send start dataflow message")?; + + let result: ControlRequestReply = + serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; + match result { + ControlRequestReply::DataflowStartTriggered { uuid } => { + eprintln!("dataflow start triggered: {uuid}"); + uuid + } + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), + } + }; + Ok((dataflow, dataflow_descriptor, session, dataflow_id)) +} + +fn
wait_until_dataflow_started( + dataflow_id: Uuid, + session: &mut Box<TcpRequestReplyConnection>, + coordinator_addr: SocketAddr, + log_level: log::LevelFilter, + print_daemon_id: bool, +) -> eyre::Result<()> { + // subscribe to log messages + let mut log_session = TcpConnection { + stream: TcpStream::connect(coordinator_addr) + .wrap_err("failed to connect to dora coordinator")?, + }; + log_session + .send( + &serde_json::to_vec(&ControlRequest::LogSubscribe { + dataflow_id, + level: log_level, + }) + .wrap_err("failed to serialize message")?, + ) + .wrap_err("failed to send log subscribe request to coordinator")?; + std::thread::spawn(move || { + while let Ok(raw) = log_session.receive() { + let parsed: eyre::Result<LogMessage> = + serde_json::from_slice(&raw).context("failed to parse log message"); + match parsed { + Ok(log_message) => { + print_log_message(log_message, false, print_daemon_id); + } + Err(err) => { + tracing::warn!("failed to parse log message: {err:?}") + } + } + } + }); + + let reply_raw = session + .request(&serde_json::to_vec(&ControlRequest::WaitForSpawn { dataflow_id }).unwrap()) + .wrap_err("failed to send start dataflow message")?; + + let result: ControlRequestReply = + serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; + match result { + ControlRequestReply::DataflowSpawned { uuid } => { + eprintln!("dataflow started: {uuid}"); + } + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), + } + Ok(()) +} diff --git a/binaries/cli/src/common.rs b/binaries/cli/src/common.rs index 2cc71564..26431349 100644 --- a/binaries/cli/src/common.rs +++ b/binaries/cli/src/common.rs @@ -1,13 +1,17 @@ use crate::formatting::FormatDataflowError; use communication_layer_request_reply::{RequestReplyLayer, TcpLayer, TcpRequestReplyConnection}; -use dora_core::descriptor::source_is_url; +use dora_core::descriptor::{source_is_url, Descriptor}; use dora_download::download_file; use dora_message::{ cli_to_coordinator::ControlRequest, coordinator_to_cli::{ControlRequestReply, DataflowList, DataflowResult}, }; -use eyre::{bail, Context}; -use std::{env::current_dir, net::SocketAddr, path::PathBuf}; +use eyre::{bail, Context, ContextCompat}; +use std::{ + env::current_dir, + net::SocketAddr, + path::{Path, PathBuf}, +}; use tokio::runtime::Builder; use uuid::Uuid; @@ -67,3 +71,45 @@ pub(crate) fn resolve_dataflow(dataflow: String) -> eyre::Result<PathBuf> { }; Ok(dataflow) } + +pub(crate) fn local_working_dir( + dataflow_path: &Path, + dataflow_descriptor: &Descriptor, + coordinator_session: &mut TcpRequestReplyConnection, +) -> eyre::Result<Option<PathBuf>> { + Ok( + if dataflow_descriptor + .nodes + .iter() + .all(|n| n.deploy.as_ref().map(|d| d.machine.as_ref()).is_none()) + && cli_and_daemon_on_same_machine(coordinator_session)? + { + Some( + dunce::canonicalize(dataflow_path) + .context("failed to canonicalize dataflow file path")? + .parent() + .context("dataflow path has no parent dir")?
+ .to_owned(), + ) + } else { + None + }, + ) +} + +pub(crate) fn cli_and_daemon_on_same_machine(session: &mut TcpRequestReplyConnection) -> eyre::Result<bool> { + let reply_raw = session + .request(&serde_json::to_vec(&ControlRequest::CliAndDefaultDaemonOnSameMachine).unwrap()) + .wrap_err("failed to send start dataflow message")?; + + let result: ControlRequestReply = + serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; + match result { + ControlRequestReply::CliAndDefaultDaemonIps { + default_daemon, + cli, + } => Ok(default_daemon.is_some() && default_daemon == cli), + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), + } +} diff --git a/binaries/cli/src/lib.rs b/binaries/cli/src/lib.rs index 6575324d..d1e026c8 100644 --- a/binaries/cli/src/lib.rs +++ b/binaries/cli/src/lib.rs @@ -5,10 +5,11 @@ use std::{ path::PathBuf, }; -mod attach; mod commands; mod common; mod formatting; +pub mod output; +pub mod session; mod template; const LOCALHOST: IpAddr = IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)); diff --git a/binaries/cli/src/output.rs b/binaries/cli/src/output.rs new file mode 100644 index 00000000..ff5ba755 --- /dev/null +++ b/binaries/cli/src/output.rs @@ -0,0 +1,62 @@ +use colored::Colorize; +use dora_core::build::LogLevelOrStdout; +use dora_message::common::LogMessage; + +pub fn print_log_message( + log_message: LogMessage, + print_dataflow_id: bool, + print_daemon_name: bool, +) { + let LogMessage { + build_id: _, + dataflow_id, + node_id, + daemon_id, + level, + target, + module_path: _, + file: _, + line: _, + message, + } = log_message; + let level = match level { + LogLevelOrStdout::LogLevel(level) => match level { + log::Level::Error => "ERROR ".red(), + log::Level::Warn => "WARN ".yellow(), + log::Level::Info => "INFO ".green(), + log::Level::Debug => "DEBUG ".bright_blue(), + log::Level::Trace => "TRACE ".dimmed(), + }, + LogLevelOrStdout::Stdout => "stdout".bright_blue().italic().dimmed(), + }; + + let dataflow = match dataflow_id { + Some(dataflow_id) if print_dataflow_id => format!("dataflow `{dataflow_id}` ").cyan(), + _ => String::new().cyan(), + }; + let daemon = match daemon_id { + Some(id) if print_daemon_name => match id.machine_id() { + Some(machine_id) => format!("on daemon `{machine_id}`"), + None => "on default daemon ".to_string(), + }, + None if print_daemon_name => "on default daemon".to_string(), + _ => String::new(), + } + .bright_black(); + let colon = ":".bright_black().bold(); + let node = match node_id { + Some(node_id) => { + let node_id = node_id.to_string().dimmed().bold(); + let padding = if daemon.is_empty() { "" } else { " " }; + format!("{node_id}{padding}{daemon}{colon} ") + } + None if daemon.is_empty() => "".into(), + None => format!("{daemon}{colon} "), + }; + let target = match target { + Some(target) => format!("{target} ").dimmed(), + None => "".normal(), + }; + + println!("{node}{level} {target}{dataflow} {message}"); +} diff --git a/binaries/cli/src/session.rs b/binaries/cli/src/session.rs new file mode 100644 index 00000000..9a8ac5b8 --- /dev/null +++ b/binaries/cli/src/session.rs @@ -0,0 +1,98 @@ +use std::{ + collections::BTreeMap, + path::{Path, PathBuf}, +}; + +use dora_core::build::BuildInfo; +use dora_message::{common::GitSource, id::NodeId, BuildId, SessionId}; +use eyre::{Context, ContextCompat}; + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct DataflowSession { + pub build_id: Option<BuildId>, + pub session_id: SessionId, + pub
git_sources: BTreeMap, + pub local_build: Option, +} + +impl Default for DataflowSession { + fn default() -> Self { + Self { + build_id: None, + session_id: SessionId::generate(), + git_sources: Default::default(), + local_build: Default::default(), + } + } +} + +impl DataflowSession { + pub fn read_session(dataflow_path: &Path) -> eyre::Result { + let session_file = session_file_path(dataflow_path)?; + if session_file.exists() { + if let Ok(parsed) = deserialize(&session_file) { + return Ok(parsed); + } else { + tracing::warn!("failed to read dataflow session file, regenerating (you might need to run `dora build` again)"); + } + } + + let default_session = DataflowSession::default(); + default_session.write_out_for_dataflow(dataflow_path)?; + Ok(default_session) + } + + pub fn write_out_for_dataflow(&self, dataflow_path: &Path) -> eyre::Result<()> { + let session_file = session_file_path(dataflow_path)?; + let filename = session_file + .file_name() + .context("session file has no file name")? + .to_str() + .context("session file name is no utf8")?; + if let Some(parent) = session_file.parent() { + std::fs::create_dir_all(parent).context("failed to create out dir")?; + } + std::fs::write(&session_file, self.serialize()?) + .context("failed to write dataflow session file")?; + let gitignore = session_file.with_file_name(".gitignore"); + if gitignore.exists() { + let existing = + std::fs::read_to_string(&gitignore).context("failed to read gitignore")?; + if !existing + .lines() + .any(|l| l.split_once('/') == Some(("", filename))) + { + let new = existing + &format!("\n/{filename}\n"); + std::fs::write(gitignore, new).context("failed to update gitignore")?; + } + } else { + std::fs::write(gitignore, format!("/{filename}\n")) + .context("failed to write gitignore")?; + } + Ok(()) + } + + fn serialize(&self) -> eyre::Result { + serde_yaml::to_string(&self).context("failed to serialize dataflow session file") + } +} + +fn deserialize(session_file: &Path) -> eyre::Result { + std::fs::read_to_string(session_file) + .context("failed to read DataflowSession file") + .and_then(|s| { + serde_yaml::from_str(&s).context("failed to deserialize DataflowSession file") + }) +} + +fn session_file_path(dataflow_path: &Path) -> eyre::Result { + let file_stem = dataflow_path + .file_stem() + .wrap_err("dataflow path has no file stem")? 
+ .to_str() + .wrap_err("dataflow file stem is not valid utf-8")?; + let session_file = dataflow_path + .with_file_name("out") + .join(format!("{file_stem}.dora-session.yaml")); + Ok(session_file) +} diff --git a/binaries/cli/src/template/c/cmake-template.txt b/binaries/cli/src/template/c/cmake-template.txt index 32cb561f..eafe50da 100644 --- a/binaries/cli/src/template/c/cmake-template.txt +++ b/binaries/cli/src/template/c/cmake-template.txt @@ -64,16 +64,16 @@ link_directories(${dora_link_dirs}) add_executable(talker_1 talker_1/node.c) add_dependencies(talker_1 Dora_c) target_include_directories(talker_1 PRIVATE ${dora_c_include_dir}) -target_link_libraries(talker_1 dora_node_api_c m) +target_link_libraries(talker_1 dora_node_api_c m z) add_executable(talker_2 talker_2/node.c) add_dependencies(talker_2 Dora_c) target_include_directories(talker_2 PRIVATE ${dora_c_include_dir}) -target_link_libraries(talker_2 dora_node_api_c m) +target_link_libraries(talker_2 dora_node_api_c m z) add_executable(listener_1 listener_1/node.c) add_dependencies(listener_1 Dora_c) target_include_directories(listener_1 PRIVATE ${dora_c_include_dir}) -target_link_libraries(listener_1 dora_node_api_c m) +target_link_libraries(listener_1 dora_node_api_c m z) -install(TARGETS listener_1 talker_1 talker_2 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/bin) \ No newline at end of file +install(TARGETS listener_1 talker_1 talker_2 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/bin) diff --git a/binaries/cli/src/template/cxx/cmake-template.txt b/binaries/cli/src/template/cxx/cmake-template.txt index bd3fe492..7f7ce865 100644 --- a/binaries/cli/src/template/cxx/cmake-template.txt +++ b/binaries/cli/src/template/cxx/cmake-template.txt @@ -70,16 +70,16 @@ link_directories(${dora_link_dirs}) add_executable(talker_1 talker_1/node.cc ${node_bridge}) add_dependencies(talker_1 Dora_cxx) target_include_directories(talker_1 PRIVATE ${dora_cxx_include_dir}) -target_link_libraries(talker_1 dora_node_api_cxx) +target_link_libraries(talker_1 dora_node_api_cxx z) add_executable(talker_2 talker_2/node.cc ${node_bridge}) add_dependencies(talker_2 Dora_cxx) target_include_directories(talker_2 PRIVATE ${dora_cxx_include_dir}) -target_link_libraries(talker_2 dora_node_api_cxx) +target_link_libraries(talker_2 dora_node_api_cxx z) add_executable(listener_1 listener_1/node.cc ${node_bridge}) add_dependencies(listener_1 Dora_cxx) target_include_directories(listener_1 PRIVATE ${dora_cxx_include_dir}) -target_link_libraries(listener_1 dora_node_api_cxx) +target_link_libraries(listener_1 dora_node_api_cxx z) -install(TARGETS listener_1 talker_1 talker_2 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/bin) \ No newline at end of file +install(TARGETS listener_1 talker_1 talker_2 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/bin) diff --git a/binaries/coordinator/src/control.rs b/binaries/coordinator/src/control.rs index f7cb23c1..446233a8 100644 --- a/binaries/coordinator/src/control.rs +++ b/binaries/coordinator/src/control.rs @@ -2,7 +2,9 @@ use crate::{ tcp_utils::{tcp_receive, tcp_send}, Event, }; -use dora_message::{cli_to_coordinator::ControlRequest, coordinator_to_cli::ControlRequestReply}; +use dora_message::{ + cli_to_coordinator::ControlRequest, coordinator_to_cli::ControlRequestReply, BuildId, +}; use eyre::{eyre, Context}; use futures::{ future::{self, Either}, @@ -79,6 +81,7 @@ async fn handle_requests( tx: mpsc::Sender, _finish_tx: mpsc::Sender<()>, ) { + let peer_addr = connection.peer_addr().ok(); loop { let next_request = tcp_receive(&mut 
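// Illustrative sketch of where the session file from `session_file_path` above
// ends up: an `out/` directory next to the dataflow YAML, named after the
// dataflow's file stem. Simplified stand-alone version, not the implementation
// itself.
use std::path::{Path, PathBuf};

fn sketch_session_file_path(dataflow_path: &Path) -> Option<PathBuf> {
    let stem = dataflow_path.file_stem()?.to_str()?;
    Some(
        dataflow_path
            .with_file_name("out")
            .join(format!("{stem}.dora-session.yaml")),
    )
}

// e.g. `examples/rust-dataflow/dataflow.yml`
//   -> `examples/rust-dataflow/out/dataflow.dora-session.yaml`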
connection).map(Either::Left); let coordinator_stopped = tx.closed().map(Either::Right); @@ -114,11 +117,29 @@ async fn handle_requests( break; } - let result = match request { + if let Ok(ControlRequest::BuildLogSubscribe { build_id, level }) = request { + let _ = tx + .send(ControlEvent::BuildLogSubscribe { + build_id, + level, + connection, + }) + .await; + break; + } + + let mut result = match request { Ok(request) => handle_request(request, &tx).await, Err(err) => Err(err), }; + if let Ok(ControlRequestReply::CliAndDefaultDaemonIps { cli, .. }) = &mut result { + if cli.is_none() { + // fill cli IP address in reply + *cli = peer_addr.map(|s| s.ip()); + } + } + let reply = result.unwrap_or_else(|err| ControlRequestReply::Error(format!("{err:?}"))); let serialized: Vec = match serde_json::to_vec(&reply).wrap_err("failed to serialize ControlRequestReply") { @@ -155,7 +176,7 @@ async fn handle_request( ) -> eyre::Result { let (reply_tx, reply_rx) = oneshot::channel(); let event = ControlEvent::IncomingRequest { - request, + request: request.clone(), reply_sender: reply_tx, }; @@ -165,7 +186,7 @@ async fn handle_request( reply_rx .await - .unwrap_or(Ok(ControlRequestReply::CoordinatorStopped)) + .wrap_err_with(|| format!("no coordinator reply to {request:?}"))? } #[derive(Debug)] @@ -179,6 +200,11 @@ pub enum ControlEvent { level: log::LevelFilter, connection: TcpStream, }, + BuildLogSubscribe { + build_id: BuildId, + level: log::LevelFilter, + connection: TcpStream, + }, Error(eyre::Report), } diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index e002f859..e3a8d767 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -5,22 +5,27 @@ use crate::{ pub use control::ControlEvent; use dora_core::{ config::{NodeId, OperatorId}, + descriptor::DescriptorExt, uhlc::{self, HLC}, }; use dora_message::{ cli_to_coordinator::ControlRequest, - common::DaemonId, + common::{DaemonId, GitSource}, coordinator_to_cli::{ ControlRequestReply, DataflowIdAndName, DataflowList, DataflowListEntry, DataflowResult, DataflowStatus, LogLevel, LogMessage, }, - coordinator_to_daemon::{DaemonCoordinatorEvent, RegisterResult, Timestamped}, + coordinator_to_daemon::{ + BuildDataflowNodes, DaemonCoordinatorEvent, RegisterResult, Timestamped, + }, daemon_to_coordinator::{DaemonCoordinatorReply, DataflowDaemonResult}, descriptor::{Descriptor, ResolvedNode}, + BuildId, DataflowId, SessionId, }; use eyre::{bail, eyre, ContextCompat, Result, WrapErr}; use futures::{future::join_all, stream::FuturesUnordered, Future, Stream, StreamExt}; use futures_concurrency::stream::Merge; +use itertools::Itertools; use log_subscriber::LogSubscriber; use run::SpawnedDataflow; use std::{ @@ -30,7 +35,11 @@ use std::{ sync::Arc, time::{Duration, Instant}, }; -use tokio::{net::TcpStream, sync::mpsc, task::JoinHandle}; +use tokio::{ + net::TcpStream, + sync::{mpsc, oneshot}, + task::JoinHandle, +}; use tokio_stream::wrappers::{ReceiverStream, TcpListenerStream}; use uuid::Uuid; @@ -135,6 +144,10 @@ impl DaemonConnections { } } + fn get(&self, id: &DaemonId) -> Option<&DaemonConnection> { + self.daemons.get(id) + } + fn get_mut(&mut self, id: &DaemonId) -> Option<&mut DaemonConnection> { self.daemons.get_mut(id) } @@ -157,10 +170,6 @@ impl DaemonConnections { self.daemons.keys() } - fn iter(&self) -> impl Iterator { - self.daemons.iter() - } - fn iter_mut(&mut self) -> impl Iterator { self.daemons.iter_mut() } @@ -194,13 +203,20 @@ async fn start_inner( let mut events = 
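// Illustrative sketch of the peer-address fallback above: when the coordinator
// reply leaves the CLI address empty, it is filled from the control
// connection's peer address. Stand-alone example using a blocking
// `std::net::TcpStream` instead of the coordinator's tokio stream.
use std::net::{IpAddr, TcpStream};

fn fill_cli_ip(cli: &mut Option<IpAddr>, connection: &TcpStream) {
    if cli.is_none() {
        // `peer_addr` can fail (e.g. if the socket is already closed);
        // in that case the field simply stays `None`
        *cli = connection.peer_addr().ok().map(|addr| addr.ip());
    }
}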
(abortable_events, daemon_events).merge(); - let mut running_dataflows: HashMap = HashMap::new(); - let mut dataflow_results: HashMap> = + let mut running_builds: HashMap = HashMap::new(); + let mut finished_builds: HashMap = HashMap::new(); + + let mut running_dataflows: HashMap = HashMap::new(); + let mut dataflow_results: HashMap> = HashMap::new(); - let mut archived_dataflows: HashMap = HashMap::new(); + let mut archived_dataflows: HashMap = HashMap::new(); let mut daemon_connections = DaemonConnections::default(); while let Some(event) = events.next().await { + // used below for measuring the event handling duration + let start = Instant::now(); + let event_kind = event.kind(); + if event.log() { tracing::trace!("Handling event {event:?}"); } @@ -347,12 +363,13 @@ async fn start_inner( let mut finished_dataflow = entry.remove(); let dataflow_id = finished_dataflow.uuid; send_log_message( - &mut finished_dataflow, + &mut finished_dataflow.log_subscribers, &LogMessage { - dataflow_id, + build_id: None, + dataflow_id: Some(dataflow_id), node_id: None, daemon_id: None, - level: LogLevel::Info, + level: LogLevel::Info.into(), target: Some("coordinator".into()), module_path: None, file: None, @@ -371,9 +388,15 @@ async fn start_inner( DataflowResult::ok_empty(uuid, clock.new_timestamp()) }), }; - for sender in finished_dataflow.reply_senders { + for sender in finished_dataflow.stop_reply_senders { let _ = sender.send(Ok(reply.clone())); } + if !matches!( + finished_dataflow.spawn_result, + CachedResult::Cached { .. } + ) { + log::error!("pending spawn result on dataflow finish"); + } } } std::collections::hash_map::Entry::Vacant(_) => { @@ -389,7 +412,54 @@ async fn start_inner( reply_sender, } => { match request { + ControlRequest::Build { + session_id, + dataflow, + git_sources, + prev_git_sources, + local_working_dir, + uv, + } => { + // assign a random build id + let build_id = BuildId::generate(); + + let result = build_dataflow( + build_id, + session_id, + dataflow, + git_sources, + prev_git_sources, + local_working_dir, + &clock, + uv, + &mut daemon_connections, + ) + .await; + match result { + Ok(build) => { + running_builds.insert(build_id, build); + let _ = reply_sender.send(Ok( + ControlRequestReply::DataflowBuildTriggered { build_id }, + )); + } + Err(err) => { + let _ = reply_sender.send(Err(err)); + } + } + } + ControlRequest::WaitForBuild { build_id } => { + if let Some(build) = running_builds.get_mut(&build_id) { + build.build_result.register(reply_sender); + } else if let Some(result) = finished_builds.get_mut(&build_id) { + result.register(reply_sender); + } else { + let _ = + reply_sender.send(Err(eyre!("unknown build id {build_id}"))); + } + } ControlRequest::Start { + build_id, + session_id, dataflow, name, local_working_dir, @@ -408,6 +478,8 @@ async fn start_inner( } } let dataflow = start_dataflow( + build_id, + session_id, dataflow, local_working_dir, name, @@ -418,16 +490,30 @@ async fn start_inner( .await?; Ok(dataflow) }; - let reply = inner.await.map(|dataflow| { - let uuid = dataflow.uuid; - running_dataflows.insert(uuid, dataflow); - ControlRequestReply::DataflowStarted { uuid } - }); - let _ = reply_sender.send(reply); + match inner.await { + Ok(dataflow) => { + let uuid = dataflow.uuid; + running_dataflows.insert(uuid, dataflow); + let _ = reply_sender.send(Ok( + ControlRequestReply::DataflowStartTriggered { uuid }, + )); + } + Err(err) => { + let _ = reply_sender.send(Err(err)); + } + } + } + ControlRequest::WaitForSpawn { dataflow_id } => { + if let 
Some(dataflow) = running_dataflows.get_mut(&dataflow_id) { + dataflow.spawn_result.register(reply_sender); + } else { + let _ = + reply_sender.send(Err(eyre!("unknown dataflow {dataflow_id}"))); + } } ControlRequest::Check { dataflow_uuid } => { let status = match &running_dataflows.get(&dataflow_uuid) { - Some(_) => ControlRequestReply::DataflowStarted { + Some(_) => ControlRequestReply::DataflowSpawned { uuid: dataflow_uuid, }, None => ControlRequestReply::DataflowStopped { @@ -495,7 +581,7 @@ async fn start_inner( match dataflow { Ok(dataflow) => { - dataflow.reply_senders.push(reply_sender); + dataflow.stop_reply_senders.push(reply_sender); } Err(err) => { let _ = reply_sender.send(Err(err)); @@ -528,7 +614,7 @@ async fn start_inner( match dataflow { Ok(dataflow) => { - dataflow.reply_senders.push(reply_sender); + dataflow.stop_reply_senders.push(reply_sender); } Err(err) => { let _ = reply_sender.send(Err(err)); @@ -626,6 +712,27 @@ async fn start_inner( "LogSubscribe request should be handled separately" ))); } + ControlRequest::BuildLogSubscribe { .. } => { + let _ = reply_sender.send(Err(eyre::eyre!( + "BuildLogSubscribe request should be handled separately" + ))); + } + ControlRequest::CliAndDefaultDaemonOnSameMachine => { + let mut default_daemon_ip = None; + if let Some(default_id) = daemon_connections.unnamed().next() { + if let Some(connection) = daemon_connections.get(default_id) { + if let Ok(addr) = connection.stream.peer_addr() { + default_daemon_ip = Some(addr.ip()); + } + } + } + let _ = reply_sender.send(Ok( + ControlRequestReply::CliAndDefaultDaemonIps { + default_daemon: default_daemon_ip, + cli: None, // filled later + }, + )); + } } } ControlEvent::Error(err) => tracing::error!("{err:?}"), @@ -640,6 +747,17 @@ async fn start_inner( .push(LogSubscriber::new(level, connection)); } } + ControlEvent::BuildLogSubscribe { + build_id, + level, + connection, + } => { + if let Some(build) = running_builds.get_mut(&build_id) { + build + .log_subscribers + .push(LogSubscriber::new(level, connection)); + } + } }, Event::DaemonHeartbeatInterval => { let mut disconnected = BTreeSet::new(); @@ -695,14 +813,89 @@ async fn start_inner( } } Event::Log(message) => { - if let Some(dataflow) = running_dataflows.get_mut(&message.dataflow_id) { - send_log_message(dataflow, &message).await; + if let Some(dataflow_id) = &message.dataflow_id { + if let Some(dataflow) = running_dataflows.get_mut(dataflow_id) { + send_log_message(&mut dataflow.log_subscribers, &message).await; + } + } + if let Some(build_id) = message.build_id { + if let Some(build) = running_builds.get_mut(&build_id) { + send_log_message(&mut build.log_subscribers, &message).await; + } } } Event::DaemonExit { daemon_id } => { tracing::info!("Daemon `{daemon_id}` exited"); daemon_connections.remove(&daemon_id); } + Event::DataflowBuildResult { + build_id, + daemon_id, + result, + } => match running_builds.get_mut(&build_id) { + Some(build) => { + build.pending_build_results.remove(&daemon_id); + match result { + Ok(()) => {} + Err(err) => { + build.errors.push(format!("{err:?}")); + } + }; + if build.pending_build_results.is_empty() { + tracing::info!("dataflow build finished: `{build_id}`"); + let mut build = running_builds.remove(&build_id).unwrap(); + let result = if build.errors.is_empty() { + Ok(()) + } else { + Err(format!("build failed: {}", build.errors.join("\n\n"))) + }; + + build.build_result.set_result(Ok( + ControlRequestReply::DataflowBuildFinished { build_id, result }, + )); + + finished_builds.insert(build_id, 
build.build_result); + } + } + None => { + tracing::warn!("received DataflowSpawnResult, but no matching dataflow in `running_dataflows` map"); + } + }, + Event::DataflowSpawnResult { + dataflow_id, + daemon_id, + result, + } => match running_dataflows.get_mut(&dataflow_id) { + Some(dataflow) => { + dataflow.pending_spawn_results.remove(&daemon_id); + match result { + Ok(()) => { + if dataflow.pending_spawn_results.is_empty() { + tracing::info!("successfully spawned dataflow `{dataflow_id}`",); + dataflow.spawn_result.set_result(Ok( + ControlRequestReply::DataflowSpawned { uuid: dataflow_id }, + )); + } + } + Err(err) => { + tracing::warn!("error while spawning dataflow `{dataflow_id}`"); + dataflow.spawn_result.set_result(Err(err)); + } + }; + } + None => { + tracing::warn!("received DataflowSpawnResult, but no matching dataflow in `running_dataflows` map"); + } + }, + } + + // warn if event handling took too long -> the main loop should never be blocked for too long + let elapsed = start.elapsed(); + if elapsed > Duration::from_millis(100) { + tracing::warn!( + "Coordinator took {}ms for handling event: {event_kind}", + elapsed.as_millis() + ); } } @@ -711,8 +904,8 @@ async fn start_inner( Ok(()) } -async fn send_log_message(dataflow: &mut RunningDataflow, message: &LogMessage) { - for subscriber in &mut dataflow.log_subscribers { +async fn send_log_message(log_subscribers: &mut Vec, message: &LogMessage) { + for subscriber in log_subscribers.iter_mut() { let send_result = tokio::time::timeout(Duration::from_millis(100), subscriber.send_message(message)); @@ -720,7 +913,7 @@ async fn send_log_message(dataflow: &mut RunningDataflow, message: &LogMessage) subscriber.close(); } } - dataflow.log_subscribers.retain(|s| !s.is_closed()); + log_subscribers.retain(|s| !s.is_closed()); } fn dataflow_result( @@ -787,6 +980,15 @@ async fn send_heartbeat_message( .wrap_err("failed to send heartbeat message to daemon") } +struct RunningBuild { + errors: Vec, + build_result: CachedResult, + + log_subscribers: Vec, + + pending_build_results: BTreeSet, +} + struct RunningDataflow { name: Option, uuid: Uuid, @@ -797,9 +999,66 @@ struct RunningDataflow { exited_before_subscribe: Vec, nodes: BTreeMap, - reply_senders: Vec>>, + spawn_result: CachedResult, + stop_reply_senders: Vec>>, log_subscribers: Vec, + + pending_spawn_results: BTreeSet, +} + +pub enum CachedResult { + Pending { + result_senders: Vec>>, + }, + Cached { + result: eyre::Result, + }, +} + +impl Default for CachedResult { + fn default() -> Self { + Self::Pending { + result_senders: Vec::new(), + } + } +} + +impl CachedResult { + fn register( + &mut self, + reply_sender: tokio::sync::oneshot::Sender>, + ) { + match self { + CachedResult::Pending { result_senders } => result_senders.push(reply_sender), + CachedResult::Cached { result } => { + Self::send_result_to(result, reply_sender); + } + } + } + + fn set_result(&mut self, result: eyre::Result) { + match self { + CachedResult::Pending { result_senders } => { + for sender in result_senders.drain(..) { + Self::send_result_to(&result, sender); + } + *self = CachedResult::Cached { result }; + } + CachedResult::Cached { .. 
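// Illustrative sketch of the slow-event warning added to the coordinator (and
// daemon) main loops above: every event is timed and anything that blocks the
// loop for more than 100 ms is reported. Simplified stand-alone version using
// `eprintln!` in place of `tracing::warn!`.
use std::time::{Duration, Instant};

fn handle_timed<E>(event: E, kind: &'static str, mut handler: impl FnMut(E)) {
    let start = Instant::now();
    handler(event);
    let elapsed = start.elapsed();
    if elapsed > Duration::from_millis(100) {
        eprintln!("handling {kind} took {} ms", elapsed.as_millis());
    }
}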
} => {} + } + } + + fn send_result_to( + result: &eyre::Result, + sender: oneshot::Sender>, + ) { + let result = match result { + Ok(r) => Ok(r.clone()), + Err(err) => Err(eyre!("{err:?}")), + }; + let _ = sender.send(result); + } } struct ArchivedDataflow { @@ -943,7 +1202,7 @@ async fn retrieve_logs( let machine_ids: Vec> = nodes .values() .filter(|node| node.id == node_id) - .map(|node| node.deploy.machine.clone()) + .map(|node| node.deploy.as_ref().and_then(|d| d.machine.clone())) .collect(); let machine_id = if let [machine_id] = &machine_ids[..] { @@ -992,9 +1251,127 @@ async fn retrieve_logs( reply_logs.map_err(|err| eyre!(err)) } +#[allow(clippy::too_many_arguments)] +#[tracing::instrument(skip(daemon_connections, clock))] +async fn build_dataflow( + build_id: BuildId, + session_id: SessionId, + dataflow: Descriptor, + git_sources: BTreeMap, + prev_git_sources: BTreeMap, + local_working_dir: Option, + clock: &HLC, + uv: bool, + daemon_connections: &mut DaemonConnections, +) -> eyre::Result { + let nodes = dataflow.resolve_aliases_and_set_defaults()?; + + let mut git_sources_by_daemon = git_sources + .into_iter() + .into_grouping_map_by(|(id, _)| { + nodes + .get(id) + .and_then(|n| n.deploy.as_ref().and_then(|d| d.machine.as_ref())) + }) + .collect(); + let mut prev_git_sources_by_daemon = prev_git_sources + .into_iter() + .into_grouping_map_by(|(id, _)| { + nodes + .get(id) + .and_then(|n| n.deploy.as_ref().and_then(|d| d.machine.as_ref())) + }) + .collect(); + + let nodes_by_daemon = nodes + .values() + .into_group_map_by(|n| n.deploy.as_ref().and_then(|d| d.machine.as_ref())); + + let mut daemons = BTreeSet::new(); + for (machine, nodes_on_machine) in &nodes_by_daemon { + let nodes_on_machine = nodes_on_machine.iter().map(|n| n.id.clone()).collect(); + tracing::debug!( + "Running dataflow build `{build_id}` on machine `{machine:?}` (nodes: {nodes_on_machine:?})" + ); + + let build_command = BuildDataflowNodes { + build_id, + session_id, + local_working_dir: local_working_dir.clone(), + git_sources: git_sources_by_daemon.remove(machine).unwrap_or_default(), + prev_git_sources: prev_git_sources_by_daemon + .remove(machine) + .unwrap_or_default(), + dataflow_descriptor: dataflow.clone(), + nodes_on_machine, + uv, + }; + let message = serde_json::to_vec(&Timestamped { + inner: DaemonCoordinatorEvent::Build(build_command), + timestamp: clock.new_timestamp(), + })?; + + let daemon_id = + build_dataflow_on_machine(daemon_connections, machine.map(|s| s.as_str()), &message) + .await + .wrap_err_with(|| format!("failed to build dataflow on machine `{machine:?}`"))?; + daemons.insert(daemon_id); + } + + tracing::info!("successfully triggered dataflow build `{build_id}`",); + + Ok(RunningBuild { + errors: Vec::new(), + build_result: CachedResult::default(), + log_subscribers: Vec::new(), + pending_build_results: daemons, + }) +} + +async fn build_dataflow_on_machine( + daemon_connections: &mut DaemonConnections, + machine: Option<&str>, + message: &[u8], +) -> Result { + let daemon_id = match machine { + Some(machine) => daemon_connections + .get_matching_daemon_id(machine) + .wrap_err_with(|| format!("no matching daemon for machine id {machine:?}"))? + .clone(), + None => daemon_connections + .unnamed() + .next() + .wrap_err("no unnamed daemon connections")? 
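// Illustrative sketch of the `CachedResult` pattern above: while a result is
// pending, interested parties register a oneshot sender; once the result
// arrives it is delivered to all registered senders and cached so that later
// subscribers get an immediate answer. Simplified to `String` results; the
// real type carries `eyre::Result<ControlRequestReply>`.
use tokio::sync::oneshot;

enum Cached {
    Pending { senders: Vec<oneshot::Sender<String>> },
    Done { result: String },
}

impl Cached {
    fn register(&mut self, sender: oneshot::Sender<String>) {
        match self {
            Cached::Pending { senders } => senders.push(sender),
            Cached::Done { result } => {
                // late subscriber: reply immediately from the cache
                let _ = sender.send(result.clone());
            }
        }
    }

    fn set(&mut self, result: String) {
        if let Cached::Pending { senders } = self {
            for sender in senders.drain(..) {
                let _ = sender.send(result.clone());
            }
            *self = Cached::Done { result };
        }
        // a result that is already cached is kept (first write wins)
    }
}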
+ .clone(), + }; + + let daemon_connection = daemon_connections + .get_mut(&daemon_id) + .wrap_err_with(|| format!("no daemon connection for daemon `{daemon_id}`"))?; + tcp_send(&mut daemon_connection.stream, message) + .await + .wrap_err("failed to send build message to daemon")?; + + let reply_raw = tcp_receive(&mut daemon_connection.stream) + .await + .wrap_err("failed to receive build reply from daemon")?; + match serde_json::from_slice(&reply_raw) + .wrap_err("failed to deserialize build reply from daemon")? + { + DaemonCoordinatorReply::TriggerBuildResult(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("daemon returned an error")?, + _ => bail!("unexpected reply"), + } + Ok(daemon_id) +} + +#[allow(clippy::too_many_arguments)] async fn start_dataflow( + build_id: Option, + session_id: SessionId, dataflow: Descriptor, - working_dir: PathBuf, + local_working_dir: Option, name: Option, daemon_connections: &mut DaemonConnections, clock: &HLC, @@ -1004,7 +1381,16 @@ async fn start_dataflow( uuid, daemons, nodes, - } = spawn_dataflow(dataflow, working_dir, daemon_connections, clock, uv).await?; + } = spawn_dataflow( + build_id, + session_id, + dataflow, + local_working_dir, + daemon_connections, + clock, + uv, + ) + .await?; Ok(RunningDataflow { uuid, name, @@ -1014,10 +1400,12 @@ async fn start_dataflow( BTreeSet::new() }, exited_before_subscribe: Default::default(), - daemons, + daemons: daemons.clone(), nodes, - reply_senders: Vec::new(), + spawn_result: CachedResult::default(), + stop_reply_senders: Vec::new(), log_subscribers: Vec::new(), + pending_spawn_results: daemons, }) } @@ -1092,6 +1480,16 @@ pub enum Event { DaemonExit { daemon_id: dora_message::common::DaemonId, }, + DataflowBuildResult { + build_id: BuildId, + daemon_id: DaemonId, + result: eyre::Result<()>, + }, + DataflowSpawnResult { + dataflow_id: uuid::Uuid, + daemon_id: DaemonId, + result: eyre::Result<()>, + }, } impl Event { @@ -1103,6 +1501,23 @@ impl Event { _ => true, } } + + fn kind(&self) -> &'static str { + match self { + Event::NewDaemonConnection(_) => "NewDaemonConnection", + Event::DaemonConnectError(_) => "DaemonConnectError", + Event::DaemonHeartbeat { .. } => "DaemonHeartbeat", + Event::Dataflow { .. } => "Dataflow", + Event::Control(_) => "Control", + Event::Daemon(_) => "Daemon", + Event::DaemonHeartbeatInterval => "DaemonHeartbeatInterval", + Event::CtrlC => "CtrlC", + Event::Log(_) => "Log", + Event::DaemonExit { .. } => "DaemonExit", + Event::DataflowBuildResult { .. } => "DataflowBuildResult", + Event::DataflowSpawnResult { .. 
} => "DataflowSpawnResult", + } + } } #[derive(Debug)] diff --git a/binaries/coordinator/src/listener.rs b/binaries/coordinator/src/listener.rs index 6c666082..ab7e3b9d 100644 --- a/binaries/coordinator/src/listener.rs +++ b/binaries/coordinator/src/listener.rs @@ -112,6 +112,29 @@ pub async fn handle_connection( break; } } + DaemonEvent::BuildResult { build_id, result } => { + let event = Event::DataflowBuildResult { + build_id, + daemon_id, + result: result.map_err(|err| eyre::eyre!(err)), + }; + if events_tx.send(event).await.is_err() { + break; + } + } + DaemonEvent::SpawnResult { + dataflow_id, + result, + } => { + let event = Event::DataflowSpawnResult { + dataflow_id, + daemon_id, + result: result.map_err(|err| eyre::eyre!(err)), + }; + if events_tx.send(event).await.is_err() { + break; + } + } }, }; } diff --git a/binaries/coordinator/src/log_subscriber.rs b/binaries/coordinator/src/log_subscriber.rs index cb602d47..e5006616 100644 --- a/binaries/coordinator/src/log_subscriber.rs +++ b/binaries/coordinator/src/log_subscriber.rs @@ -17,9 +17,15 @@ impl LogSubscriber { } pub async fn send_message(&mut self, message: &LogMessage) -> eyre::Result<()> { - if message.level > self.level { - return Ok(()); + match message.level { + dora_core::build::LogLevelOrStdout::LogLevel(level) => { + if level > self.level { + return Ok(()); + } + } + dora_core::build::LogLevelOrStdout::Stdout => {} } + let message = serde_json::to_vec(&message)?; let connection = self.connection.as_mut().context("connection is closed")?; tcp_send(connection, &message) diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index f6f88e83..9edcabd3 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -10,6 +10,7 @@ use dora_message::{ daemon_to_coordinator::DaemonCoordinatorReply, descriptor::{Descriptor, ResolvedNode}, id::NodeId, + BuildId, SessionId, }; use eyre::{bail, eyre, ContextCompat, WrapErr}; use itertools::Itertools; @@ -21,8 +22,10 @@ use uuid::{NoContext, Timestamp, Uuid}; #[tracing::instrument(skip(daemon_connections, clock))] pub(super) async fn spawn_dataflow( + build_id: Option, + session_id: SessionId, dataflow: Descriptor, - working_dir: PathBuf, + local_working_dir: Option, daemon_connections: &mut DaemonConnections, clock: &HLC, uv: bool, @@ -30,7 +33,9 @@ pub(super) async fn spawn_dataflow( let nodes = dataflow.resolve_aliases_and_set_defaults()?; let uuid = Uuid::new_v7(Timestamp::now(NoContext)); - let nodes_by_daemon = nodes.values().into_group_map_by(|n| &n.deploy.machine); + let nodes_by_daemon = nodes + .values() + .into_group_map_by(|n| n.deploy.as_ref().and_then(|d| d.machine.as_ref())); let mut daemons = BTreeSet::new(); for (machine, nodes_on_machine) in &nodes_by_daemon { @@ -40,8 +45,10 @@ pub(super) async fn spawn_dataflow( ); let spawn_command = SpawnDataflowNodes { + build_id, + session_id, dataflow_id: uuid, - working_dir: working_dir.clone(), + local_working_dir: local_working_dir.clone(), nodes: nodes.clone(), dataflow_descriptor: dataflow.clone(), spawn_nodes, @@ -52,13 +59,14 @@ pub(super) async fn spawn_dataflow( timestamp: clock.new_timestamp(), })?; - let daemon_id = spawn_dataflow_on_machine(daemon_connections, machine.as_deref(), &message) - .await - .wrap_err_with(|| format!("failed to spawn dataflow on machine `{machine:?}`"))?; + let daemon_id = + spawn_dataflow_on_machine(daemon_connections, machine.map(|m| m.as_str()), &message) + .await + .wrap_err_with(|| format!("failed to spawn dataflow on 
machine `{machine:?}`"))?; daemons.insert(daemon_id); } - tracing::info!("successfully spawned dataflow `{uuid}`"); + tracing::info!("successfully triggered dataflow spawn `{uuid}`",); Ok(SpawnedDataflow { uuid, @@ -90,13 +98,14 @@ async fn spawn_dataflow_on_machine( tcp_send(&mut daemon_connection.stream, message) .await .wrap_err("failed to send spawn message to daemon")?; + let reply_raw = tcp_receive(&mut daemon_connection.stream) .await .wrap_err("failed to receive spawn reply from daemon")?; match serde_json::from_slice(&reply_raw) .wrap_err("failed to deserialize spawn reply from daemon")? { - DaemonCoordinatorReply::SpawnResult(result) => result + DaemonCoordinatorReply::TriggerSpawnResult(result) => result .map_err(|e| eyre!(e)) .wrap_err("daemon returned an error")?, _ => bail!("unexpected reply"), diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index ca29d9b5..423dba86 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -24,14 +24,14 @@ tracing = "0.1.36" tracing-opentelemetry = { version = "0.18.0", optional = true } futures-concurrency = "7.1.0" serde_json = "1.0.86" -dora-core = { workspace = true } +dora-core = { workspace = true, features = ["build"] } flume = "0.10.14" dora-download = { workspace = true } dora-tracing = { workspace = true, optional = true } dora-arrow-convert = { workspace = true } dora-node-api = { workspace = true } dora-message = { workspace = true } -serde_yaml = "0.8.23" +serde_yaml = { workspace = true } uuid = { version = "1.7", features = ["v7"] } futures = "0.3.25" shared-memory-server = { workspace = true } @@ -44,3 +44,7 @@ sysinfo = "0.30.11" crossbeam = "0.8.4" crossbeam-skiplist = "0.1.3" zenoh = "1.1.1" +url = "2.5.4" +git2 = { workspace = true } +dunce = "1.0.5" +itertools = "0.14" diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index e309e066..d23dd2b8 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -2,6 +2,7 @@ use aligned_vec::{AVec, ConstAlign}; use coordinator::CoordinatorEvent; use crossbeam::queue::ArrayQueue; use dora_core::{ + build::{self, BuildInfo, GitManager, PrevGitSource}, config::{DataId, Input, InputMapping, NodeId, NodeRunConfig, OperatorId}, descriptor::{ read_as_descriptor, CoreNodeKind, Descriptor, DescriptorExt, ResolvedNode, RuntimeNode, @@ -12,18 +13,20 @@ use dora_core::{ }; use dora_message::{ common::{ - DaemonId, DataMessage, DropToken, LogLevel, NodeError, NodeErrorCause, NodeExitStatus, + DaemonId, DataMessage, DropToken, GitSource, LogLevel, NodeError, NodeErrorCause, + NodeExitStatus, }, coordinator_to_cli::DataflowResult, - coordinator_to_daemon::{DaemonCoordinatorEvent, SpawnDataflowNodes}, + coordinator_to_daemon::{BuildDataflowNodes, DaemonCoordinatorEvent, SpawnDataflowNodes}, daemon_to_coordinator::{ CoordinatorRequest, DaemonCoordinatorReply, DaemonEvent, DataflowDaemonResult, }, daemon_to_daemon::InterDaemonEvent, daemon_to_node::{DaemonReply, NodeConfig, NodeDropEvent, NodeEvent}, + descriptor::NodeSource, metadata::{self, ArrowTypeInfo}, node_to_daemon::{DynamicNodeEvent, Timestamped}, - DataflowId, + BuildId, DataflowId, SessionId, }; use dora_node_api::{arrow::datatypes::DataType, Parameter}; use eyre::{bail, eyre, Context, ContextCompat, Result}; @@ -34,8 +37,11 @@ use log::{DaemonLogger, DataflowLogger, Logger}; use pending::PendingNodes; use shared_memory_server::ShmemConf; use socket_stream_utils::socket_stream_send; +use spawn::Spawner; use std::{ collections::{BTreeMap, BTreeSet, HashMap}, + 
env::current_dir, + future::Future, net::SocketAddr, path::{Path, PathBuf}, pin::pin, @@ -57,6 +63,9 @@ use tokio_stream::{wrappers::ReceiverStream, Stream, StreamExt}; use tracing::{error, warn}; use uuid::{NoContext, Timestamp, Uuid}; +pub use flume; +pub use log::LogDestination; + mod coordinator; mod local_listener; mod log; @@ -97,10 +106,20 @@ pub struct Daemon { remote_daemon_events_tx: Option>>>, logger: DaemonLogger, + + sessions: BTreeMap, + builds: BTreeMap, + git_manager: GitManager, } type DaemonRunResult = BTreeMap>>; +struct NodeBuildTask { + node_id: NodeId, + dynamic_node: bool, + task: F, +} + impl Daemon { pub async fn run( coordinator_addr: SocketAddr, @@ -130,6 +149,20 @@ impl Daemon { future::Either::Right((events, _)) => events?, } }; + + let log_destination = { + // additional connection for logging + let stream = TcpStream::connect(coordinator_addr) + .await + .wrap_err("failed to connect log to dora-coordinator")?; + stream + .set_nodelay(true) + .wrap_err("failed to set TCP_NODELAY")?; + LogDestination::Coordinator { + coordinator_connection: stream, + } + }; + Self::run_general( (ReceiverStream::new(ctrlc_events), incoming_events).merge(), Some(coordinator_addr), @@ -137,12 +170,21 @@ impl Daemon { None, clock, Some(remote_daemon_events_tx), + Default::default(), + log_destination, ) .await .map(|_| ()) } - pub async fn run_dataflow(dataflow_path: &Path, uv: bool) -> eyre::Result { + pub async fn run_dataflow( + dataflow_path: &Path, + build_id: Option, + local_build: Option, + session_id: SessionId, + uv: bool, + log_destination: LogDestination, + ) -> eyre::Result { let working_dir = dataflow_path .canonicalize() .context("failed to canonicalize dataflow path")? @@ -151,13 +193,24 @@ impl Daemon { .to_owned(); let descriptor = read_as_descriptor(dataflow_path).await?; + if let Some(node) = descriptor.nodes.iter().find(|n| n.deploy.is_some()) { + eyre::bail!( + "node {} has a `deploy` section, which is not supported in `dora run`\n\n + Instead, you need to spawn a `dora coordinator` and one or more `dora daemon` + instances and then use `dora start`.", + node.id + ) + } + descriptor.check(&working_dir)?; let nodes = descriptor.resolve_aliases_and_set_defaults()?; let dataflow_id = Uuid::new_v7(Timestamp::now(NoContext)); let spawn_command = SpawnDataflowNodes { + build_id, + session_id, dataflow_id, - working_dir, + local_working_dir: Some(working_dir), spawn_nodes: nodes.keys().cloned().collect(), nodes, dataflow_descriptor: descriptor, @@ -192,13 +245,24 @@ impl Daemon { Some(exit_when_done), clock.clone(), None, + if let Some(local_build) = local_build { + let Some(build_id) = build_id else { + bail!("no build_id, but local_build set") + }; + let mut builds = BTreeMap::new(); + builds.insert(build_id, local_build); + builds + } else { + Default::default() + }, + log_destination, ); let spawn_result = reply_rx .map_err(|err| eyre!("failed to receive spawn result: {err}")) .and_then(|r| async { match r { - Some(DaemonCoordinatorReply::SpawnResult(result)) => { + Some(DaemonCoordinatorReply::TriggerSpawnResult(result)) => { result.map_err(|err| eyre!(err)) } _ => Err(eyre!("unexpected spawn reply")), @@ -216,6 +280,7 @@ impl Daemon { }) } + #[allow(clippy::too_many_arguments)] async fn run_general( external_events: impl Stream> + Unpin, coordinator_addr: Option, @@ -223,6 +288,8 @@ impl Daemon { exit_when_done: Option>, clock: Arc, remote_daemon_events_tx: Option>>>, + builds: BTreeMap, + log_destination: LogDestination, ) -> eyre::Result { let 
coordinator_connection = match coordinator_addr { Some(addr) => { @@ -237,20 +304,6 @@ impl Daemon { None => None, }; - // additional connection for logging - let logger_coordinator_connection = match coordinator_addr { - Some(addr) => { - let stream = TcpStream::connect(addr) - .await - .wrap_err("failed to connect log to dora-coordinator")?; - stream - .set_nodelay(true) - .wrap_err("failed to set TCP_NODELAY")?; - Some(stream) - } - None => None, - }; - let zenoh_session = match std::env::var(zenoh::Config::DEFAULT_CONFIG_PATH_ENV) { Ok(path) => { let zenoh_config = zenoh::Config::from_file(&path) @@ -347,7 +400,7 @@ impl Daemon { let (dora_events_tx, dora_events_rx) = mpsc::channel(5); let daemon = Self { logger: Logger { - coordinator_connection: logger_coordinator_connection, + destination: log_destination, daemon_id: daemon_id.clone(), clock: clock.clone(), } @@ -364,6 +417,9 @@ impl Daemon { clock, zenoh_session, remote_daemon_events_tx, + git_manager: Default::default(), + builds, + sessions: Default::default(), }; let dora_events = ReceiverStream::new(dora_events_rx); @@ -392,6 +448,10 @@ impl Daemon { tracing::warn!("failed to update HLC with incoming event timestamp: {err}"); } + // used below for checking the duration of event handling + let start = Instant::now(); + let event_kind = inner.kind(); + match inner { Event::Coordinator(CoordinatorEvent { event, reply_tx }) => { let status = self.handle_coordinator_event(event, reply_tx).await?; @@ -409,10 +469,7 @@ impl Daemon { node_id, event, } => self.handle_node_event(event, dataflow, node_id).await?, - Event::Dora(event) => match self.handle_dora_event(event).await? { - RunStatus::Continue => {} - RunStatus::Exit => break, - }, + Event::Dora(event) => self.handle_dora_event(event).await?, Event::DynamicNode(event) => self.handle_dynamic_node_event(event).await?, Event::HeartbeatInterval => { if let Some(connection) = &mut self.coordinator_connection { @@ -457,6 +514,105 @@ impl Daemon { Event::DaemonError(err) => { tracing::error!("Daemon error: {err:?}"); } + Event::SpawnNodeResult { + dataflow_id, + node_id, + dynamic_node, + result, + } => match result { + Ok(running_node) => { + if let Some(dataflow) = self.running.get_mut(&dataflow_id) { + dataflow.running_nodes.insert(node_id, running_node); + } else { + tracing::error!("failed to handle SpawnNodeResult: no running dataflow with ID {dataflow_id}"); + } + } + Err(error) => { + self.dataflow_node_results + .entry(dataflow_id) + .or_default() + .insert(node_id.clone(), Err(error)); + self.handle_node_stop(dataflow_id, &node_id, dynamic_node) + .await?; + } + }, + Event::BuildDataflowResult { + build_id, + session_id, + result, + } => { + let (build_info, result) = match result { + Ok(build_info) => (Some(build_info), Ok(())), + Err(err) => (None, Err(err)), + }; + if let Some(build_info) = build_info { + self.builds.insert(build_id, build_info); + if let Some(old_build_id) = self.sessions.insert(session_id, build_id) { + self.builds.remove(&old_build_id); + } + } + if let Some(connection) = &mut self.coordinator_connection { + let msg = serde_json::to_vec(&Timestamped { + inner: CoordinatorRequest::Event { + daemon_id: self.daemon_id.clone(), + event: DaemonEvent::BuildResult { + build_id, + result: result.map_err(|err| format!("{err:?}")), + }, + }, + timestamp: self.clock.new_timestamp(), + })?; + socket_stream_send(connection, &msg).await.wrap_err( + "failed to send BuildDataflowResult message to dora-coordinator", + )?; + } + } + Event::SpawnDataflowResult { + 
dataflow_id, + result, + } => { + if let Some(connection) = &mut self.coordinator_connection { + let msg = serde_json::to_vec(&Timestamped { + inner: CoordinatorRequest::Event { + daemon_id: self.daemon_id.clone(), + event: DaemonEvent::SpawnResult { + dataflow_id, + result: result.map_err(|err| format!("{err:?}")), + }, + }, + timestamp: self.clock.new_timestamp(), + })?; + socket_stream_send(connection, &msg).await.wrap_err( + "failed to send SpawnDataflowResult message to dora-coordinator", + )?; + } + } + Event::NodeStopped { + dataflow_id, + node_id, + } => { + if let Some(exit_when_done) = &mut self.exit_when_done { + exit_when_done.remove(&(dataflow_id, node_id)); + if exit_when_done.is_empty() { + tracing::info!( + "exiting daemon because all required dataflows are finished" + ); + break; + } + } + if self.exit_when_all_finished && self.running.is_empty() { + break; + } + } + } + + // warn if event handling took too long -> the main loop should never be blocked for too long + let elapsed = start.elapsed(); + if elapsed > Duration::from_millis(100) { + tracing::warn!( + "Daemon took {}ms for handling event: {event_kind}", + elapsed.as_millis() + ); } } @@ -482,9 +638,73 @@ impl Daemon { reply_tx: Sender>, ) -> eyre::Result { let status = match event { + DaemonCoordinatorEvent::Build(BuildDataflowNodes { + build_id, + session_id, + local_working_dir, + git_sources, + prev_git_sources, + dataflow_descriptor, + nodes_on_machine, + uv, + }) => { + match dataflow_descriptor.communication.remote { + dora_core::config::RemoteCommunicationConfig::Tcp => {} + } + + let base_working_dir = self.base_working_dir(local_working_dir, session_id)?; + + let result = self + .build_dataflow( + build_id, + session_id, + base_working_dir, + git_sources, + prev_git_sources, + dataflow_descriptor, + nodes_on_machine, + uv, + ) + .await; + let (trigger_result, result_task) = match result { + Ok(result_task) => (Ok(()), Some(result_task)), + Err(err) => (Err(format!("{err:?}")), None), + }; + let reply = DaemonCoordinatorReply::TriggerBuildResult(trigger_result); + let _ = reply_tx.send(Some(reply)).map_err(|_| { + error!("could not send `TriggerBuildResult` reply from daemon to coordinator") + }); + + let result_tx = self.events_tx.clone(); + let clock = self.clock.clone(); + if let Some(result_task) = result_task { + tokio::spawn(async move { + let message = Timestamped { + inner: Event::BuildDataflowResult { + build_id, + session_id, + result: result_task.await, + }, + timestamp: clock.new_timestamp(), + }; + let _ = result_tx + .send(message) + .map_err(|_| { + error!( + "could not send `BuildResult` reply from daemon to coordinator" + ) + }) + .await; + }); + } + + RunStatus::Continue + } DaemonCoordinatorEvent::Spawn(SpawnDataflowNodes { + build_id, + session_id, dataflow_id, - working_dir, + local_working_dir, nodes, dataflow_descriptor, spawn_nodes, @@ -494,31 +714,50 @@ impl Daemon { dora_core::config::RemoteCommunicationConfig::Tcp => {} } - // Use the working directory if it exists, otherwise use the working directory where the daemon is spawned - let working_dir = if working_dir.exists() { - working_dir - } else { - std::env::current_dir().wrap_err("failed to get current working dir")? 
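// Illustrative sketch of the "trigger now, report later" flow above: the
// daemon acknowledges the coordinator request right away and spawns a task
// that awaits the long-running build/spawn future, then feeds the outcome back
// into the daemon's own event loop. Types are simplified; the single-variant
// `Event` stands in for the daemon's real event enum.
use tokio::sync::mpsc;

enum Event {
    SpawnDataflowResult { dataflow_id: u64, result: Result<(), String> },
}

fn report_when_done(
    dataflow_id: u64,
    result_task: impl std::future::Future<Output = Result<(), String>> + Send + 'static,
    events_tx: mpsc::Sender<Event>,
) {
    tokio::spawn(async move {
        let result = result_task.await;
        // if the main loop is gone, there is nobody left to report to
        let _ = events_tx
            .send(Event::SpawnDataflowResult { dataflow_id, result })
            .await;
    });
}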
- }; + let base_working_dir = self.base_working_dir(local_working_dir, session_id)?; let result = self .spawn_dataflow( + build_id, dataflow_id, - working_dir, + base_working_dir, nodes, dataflow_descriptor, spawn_nodes, uv, ) .await; - if let Err(err) = &result { - tracing::error!("{err:?}"); - } - let reply = - DaemonCoordinatorReply::SpawnResult(result.map_err(|err| format!("{err:?}"))); + let (trigger_result, result_task) = match result { + Ok(result_task) => (Ok(()), Some(result_task)), + Err(err) => (Err(format!("{err:?}")), None), + }; + let reply = DaemonCoordinatorReply::TriggerSpawnResult(trigger_result); let _ = reply_tx.send(Some(reply)).map_err(|_| { - error!("could not send `SpawnResult` reply from daemon to coordinator") + error!("could not send `TriggerSpawnResult` reply from daemon to coordinator") }); + + let result_tx = self.events_tx.clone(); + let clock = self.clock.clone(); + if let Some(result_task) = result_task { + tokio::spawn(async move { + let message = Timestamped { + inner: Event::SpawnDataflowResult { + dataflow_id, + result: result_task.await, + }, + timestamp: clock.new_timestamp(), + }; + let _ = result_tx + .send(message) + .map_err(|_| { + error!( + "could not send `SpawnResult` reply from daemon to coordinator" + ) + }) + .await; + }); + } + RunStatus::Continue } DaemonCoordinatorEvent::AllNodesReady { @@ -750,21 +989,125 @@ impl Daemon { } } + #[allow(clippy::too_many_arguments)] + async fn build_dataflow( + &mut self, + build_id: BuildId, + session_id: SessionId, + base_working_dir: PathBuf, + git_sources: BTreeMap, + prev_git_sources: BTreeMap, + dataflow_descriptor: Descriptor, + local_nodes: BTreeSet, + uv: bool, + ) -> eyre::Result>> { + let builder = build::Builder { + session_id, + base_working_dir, + uv, + }; + self.git_manager.clear_planned_builds(session_id); + + let nodes = dataflow_descriptor.resolve_aliases_and_set_defaults()?; + + let mut tasks = Vec::new(); + + // build nodes + for node in nodes.into_values().filter(|n| local_nodes.contains(&n.id)) { + let dynamic_node = node.kind.dynamic(); + + let node_id = node.id.clone(); + let mut logger = self.logger.for_node_build(build_id, node_id.clone()); + logger.log(LogLevel::Info, "building").await; + let git_source = git_sources.get(&node_id).cloned(); + let prev_git_source = prev_git_sources.get(&node_id).cloned(); + let prev_git = prev_git_source.map(|prev_source| PrevGitSource { + still_needed_for_this_build: git_sources.values().any(|s| s == &prev_source), + git_source: prev_source, + }); + + let logger_cloned = logger + .try_clone_impl() + .await + .wrap_err("failed to clone logger")?; + + let mut builder = builder.clone(); + if let Some(node_working_dir) = + node.deploy.as_ref().and_then(|d| d.working_dir.as_deref()) + { + builder.base_working_dir = builder.base_working_dir.join(node_working_dir); + } + + match builder + .build_node( + node, + git_source, + prev_git, + logger_cloned, + &mut self.git_manager, + ) + .await + .wrap_err_with(|| format!("failed to build node `{node_id}`")) + { + Ok(result) => { + tasks.push(NodeBuildTask { + node_id, + task: result, + dynamic_node, + }); + } + Err(err) => { + logger.log(LogLevel::Error, format!("{err:?}")).await; + return Err(err); + } + } + } + + let task = async move { + let mut info = BuildInfo { + node_working_dirs: Default::default(), + }; + for task in tasks { + let NodeBuildTask { + node_id, + dynamic_node, + task, + } = task; + let node = task + .await + .with_context(|| format!("failed to build node `{node_id}`"))?; + 
info.node_working_dirs + .insert(node_id, node.node_working_dir); + } + Ok(info) + }; + + Ok(task) + } + + #[allow(clippy::too_many_arguments)] async fn spawn_dataflow( &mut self, - dataflow_id: uuid::Uuid, - working_dir: PathBuf, + build_id: Option, + dataflow_id: DataflowId, + base_working_dir: PathBuf, nodes: BTreeMap, dataflow_descriptor: Descriptor, spawn_nodes: BTreeSet, uv: bool, - ) -> eyre::Result<()> { - let mut logger = self.logger.for_dataflow(dataflow_id); + ) -> eyre::Result>> { + let mut logger = self + .logger + .for_dataflow(dataflow_id) + .try_clone() + .await + .context("failed to clone logger")?; let dataflow = RunningDataflow::new(dataflow_id, self.daemon_id.clone(), &dataflow_descriptor); let dataflow = match self.running.entry(dataflow_id) { std::collections::hash_map::Entry::Vacant(entry) => { - self.working_dir.insert(dataflow_id, working_dir.clone()); + self.working_dir + .insert(dataflow_id, base_working_dir.clone()); entry.insert(dataflow) } std::collections::hash_map::Entry::Occupied(_) => { @@ -774,6 +1117,11 @@ impl Daemon { let mut stopped = Vec::new(); + let node_working_dirs = build_id + .and_then(|build_id| self.builds.get(&build_id)) + .map(|info| info.node_working_dirs.clone()) + .unwrap_or_default(); + // calculate info about mappings for node in nodes.values() { let local = spawn_nodes.contains(&node.id); @@ -810,12 +1158,23 @@ impl Daemon { } } + let spawner = Spawner { + dataflow_id, + daemon_tx: self.events_tx.clone(), + dataflow_descriptor, + clock: self.clock.clone(), + uv, + }; + + let mut tasks = Vec::new(); + // spawn nodes and set up subscriptions for node in nodes.into_values() { let mut logger = logger.reborrow().for_node(node.id.clone()); let local = spawn_nodes.contains(&node.id); if local { - if node.kind.dynamic() { + let dynamic_node = node.kind.dynamic(); + if dynamic_node { dataflow.dynamic_nodes.insert(node.id.clone()); } else { dataflow.pending_nodes.insert(node.id.clone()); @@ -830,22 +1189,28 @@ impl Daemon { logger .log(LogLevel::Info, Some("daemon".into()), "spawning") .await; - match spawn::spawn_node( - dataflow_id, - &working_dir, - node, - self.events_tx.clone(), - dataflow_descriptor.clone(), - self.clock.clone(), - node_stderr_most_recent, - uv, - &mut logger, - ) - .await - .wrap_err_with(|| format!("failed to spawn node `{node_id}`")) + let node_working_dir = node_working_dirs + .get(&node_id) + .cloned() + .or_else(|| { + node.deploy + .as_ref() + .and_then(|d| d.working_dir.as_ref().map(|d| base_working_dir.join(d))) + }) + .unwrap_or(base_working_dir.clone()) + .clone(); + match spawner + .clone() + .spawn_node(node, node_working_dir, node_stderr_most_recent, &mut logger) + .await + .wrap_err_with(|| format!("failed to spawn node `{node_id}`")) { - Ok(running_node) => { - dataflow.running_nodes.insert(node_id, running_node); + Ok(result) => { + tasks.push(NodeBuildTask { + node_id, + task: result, + dynamic_node, + }); } Err(err) => { logger @@ -858,13 +1223,11 @@ impl Daemon { node_id.clone(), Err(NodeError { timestamp: self.clock.new_timestamp(), - cause: NodeErrorCause::Other { - stderr: format!("spawn failed: {err:?}"), - }, + cause: NodeErrorCause::FailedToSpawn(format!("{err:?}")), exit_status: NodeExitStatus::Unknown, }), ); - stopped.push(node_id.clone()); + stopped.push((node_id.clone(), dynamic_node)); } } } else { @@ -922,11 +1285,133 @@ impl Daemon { } } } - for node_id in stopped { - self.handle_node_stop(dataflow_id, &node_id).await?; + for (node_id, dynamic) in stopped { + 
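// Illustrative sketch of the working-directory precedence used when spawning a
// node above: a directory recorded by a previous `dora build` wins, then a
// relative `deploy.working_dir` joined onto the base dir, and finally the base
// dir itself. Stand-alone version with plain paths instead of the real
// descriptor types.
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};

fn node_working_dir(
    node_id: &str,
    built_dirs: &BTreeMap<String, PathBuf>, // filled from build info
    deploy_working_dir: Option<&Path>,      // `deploy.working_dir` in the YAML
    base_working_dir: &Path,
) -> PathBuf {
    built_dirs
        .get(node_id)
        .cloned()
        .or_else(|| deploy_working_dir.map(|d| base_working_dir.join(d)))
        .unwrap_or_else(|| base_working_dir.to_path_buf())
}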
self.handle_node_stop(dataflow_id, &node_id, dynamic) + .await?; } - Ok(()) + let spawn_result = Self::spawn_prepared_nodes( + dataflow_id, + logger, + tasks, + self.events_tx.clone(), + self.clock.clone(), + ); + + Ok(spawn_result) + } + + async fn spawn_prepared_nodes( + dataflow_id: Uuid, + mut logger: DataflowLogger<'_>, + tasks: Vec>>>, + events_tx: mpsc::Sender>, + clock: Arc, + ) -> eyre::Result<()> { + let node_result = |node_id, dynamic_node, result| Timestamped { + inner: Event::SpawnNodeResult { + dataflow_id, + node_id, + dynamic_node, + result, + }, + timestamp: clock.new_timestamp(), + }; + let mut failed_to_prepare = None; + let mut prepared_nodes = Vec::new(); + for task in tasks { + let NodeBuildTask { + node_id, + dynamic_node, + task, + } = task; + match task.await { + Ok(node) => prepared_nodes.push(node), + Err(err) => { + if failed_to_prepare.is_none() { + failed_to_prepare = Some(node_id.clone()); + } + let node_err: NodeError = NodeError { + timestamp: clock.new_timestamp(), + cause: NodeErrorCause::FailedToSpawn(format!( + "preparing for spawn failed: {err:?}" + )), + exit_status: NodeExitStatus::Unknown, + }; + let send_result = events_tx + .send(node_result(node_id, dynamic_node, Err(node_err))) + .await; + if send_result.is_err() { + tracing::error!("failed to send SpawnNodeResult to main daemon task") + } + } + } + } + + // once all nodes are prepared, do the actual spawning + if let Some(failed_node) = failed_to_prepare { + // don't spawn any nodes when an error occurred before + for node in prepared_nodes { + let err = NodeError { + timestamp: clock.new_timestamp(), + cause: NodeErrorCause::Cascading { + caused_by_node: failed_node.clone(), + }, + exit_status: NodeExitStatus::Unknown, + }; + let send_result = events_tx + .send(node_result( + node.node_id().clone(), + node.dynamic(), + Err(err), + )) + .await; + if send_result.is_err() { + tracing::error!("failed to send SpawnNodeResult to main daemon task") + } + } + Err(eyre!("failed to prepare node {failed_node}")) + } else { + let mut spawn_result = Ok(()); + + logger + .log( + LogLevel::Info, + None, + Some("dora daemon".into()), + "finished building nodes, spawning...", + ) + .await; + + // spawn the nodes + for node in prepared_nodes { + let node_id = node.node_id().clone(); + let dynamic_node = node.dynamic(); + let mut logger = logger.reborrow().for_node(node_id.clone()); + let result = node.spawn(&mut logger).await; + let node_spawn_result = match result { + Ok(node) => Ok(node), + Err(err) => { + let node_err = NodeError { + timestamp: clock.new_timestamp(), + cause: NodeErrorCause::FailedToSpawn(format!("spawn failed: {err:?}")), + exit_status: NodeExitStatus::Unknown, + }; + if spawn_result.is_ok() { + spawn_result = Err(err.wrap_err(format!("failed to spawn {node_id}"))); + } + Err(node_err) + } + }; + let send_result = events_tx + .send(node_result(node_id, dynamic_node, node_spawn_result)) + .await; + if send_result.is_err() { + tracing::error!("failed to send SpawnNodeResult to main daemon task") + } + } + spawn_result + } } async fn handle_dynamic_node_event( @@ -946,7 +1431,7 @@ impl Daemon { let node_config = match number_node_id { 2.. => Err(format!( - "multiple dataflows contains dynamic node id {node_id}. \ + "multiple dataflows contain dynamic node id {node_id}. 
\ Please only have one running dataflow with the specified \ node id if you want to use dynamic node", )), @@ -958,7 +1443,9 @@ impl Daemon { let node_config = dataflow .running_nodes .get(&node_id) - .context("no node with ID `{node_id}` within the given dataflow")? + .with_context(|| { + format!("no node with ID `{node_id}` within the given dataflow") + })? .node_config .clone(); if !node_config.dynamic { @@ -974,7 +1461,7 @@ impl Daemon { "failed to get dynamic node config within given dataflow: {err}" ) }), - 0 => Err("no node with ID `{node_id}`".to_string()), + 0 => Err(format!("no node with ID `{node_id}`")), }; let reply = DaemonReply::NodeConfig { @@ -1348,11 +1835,49 @@ impl Daemon { Ok(()) } - async fn handle_node_stop(&mut self, dataflow_id: Uuid, node_id: &NodeId) -> eyre::Result<()> { + async fn handle_node_stop( + &mut self, + dataflow_id: Uuid, + node_id: &NodeId, + dynamic_node: bool, + ) -> eyre::Result<()> { + let result = self + .handle_node_stop_inner(dataflow_id, node_id, dynamic_node) + .await; + let _ = self + .events_tx + .send(Timestamped { + inner: Event::NodeStopped { + dataflow_id, + node_id: node_id.clone(), + }, + timestamp: self.clock.new_timestamp(), + }) + .await; + result + } + + async fn handle_node_stop_inner( + &mut self, + dataflow_id: Uuid, + node_id: &NodeId, + dynamic_node: bool, + ) -> eyre::Result<()> { let mut logger = self.logger.for_dataflow(dataflow_id); - let dataflow = self.running.get_mut(&dataflow_id).wrap_err_with(|| { - format!("failed to get downstream nodes: no running dataflow with ID `{dataflow_id}`") - })?; + let dataflow = match self.running.get_mut(&dataflow_id) { + Some(dataflow) => dataflow, + None if dynamic_node => { + // The dataflow might be done already as we don't wait for dynamic nodes. In this + // case, we don't need to do anything to handle the node stop. 
+ tracing::debug!( + "dynamic node {dataflow_id}/{node_id} stopped after dataflow was done" + ); + return Ok(()); + } + None => eyre::bail!( + "failed to get downstream nodes: no running dataflow with ID `{dataflow_id}`" + ), + }; dataflow .pending_nodes @@ -1374,10 +1899,11 @@ impl Daemon { if let Some(mut pid) = dataflow.running_nodes.remove(node_id).and_then(|n| n.pid) { pid.mark_as_stopped() } - if dataflow - .running_nodes - .iter() - .all(|(_id, n)| n.node_config.dynamic) + if !dataflow.pending_nodes.local_nodes_pending() + && dataflow + .running_nodes + .iter() + .all(|(_id, n)| n.node_config.dynamic) { let result = DataflowDaemonResult { timestamp: self.clock.new_timestamp(), @@ -1388,6 +1914,13 @@ impl Daemon { .clone(), }; + self.git_manager + .clones_in_use + .values_mut() + .for_each(|dataflows| { + dataflows.remove(&dataflow_id); + }); + logger .log( LogLevel::Info, @@ -1417,7 +1950,7 @@ impl Daemon { Ok(()) } - async fn handle_dora_event(&mut self, event: DoraEvent) -> eyre::Result { + async fn handle_dora_event(&mut self, event: DoraEvent) -> eyre::Result<()> { match event { DoraEvent::Timer { dataflow_id, @@ -1426,11 +1959,11 @@ impl Daemon { } => { let Some(dataflow) = self.running.get_mut(&dataflow_id) else { tracing::warn!("Timer event for unknown dataflow `{dataflow_id}`"); - return Ok(RunStatus::Continue); + return Ok(()); }; let Some(subscribers) = dataflow.timers.get(&interval) else { - return Ok(RunStatus::Continue); + return Ok(()); }; let mut closed = Vec::new(); @@ -1467,7 +2000,7 @@ impl Daemon { } => { let Some(dataflow) = self.running.get_mut(&dataflow_id) else { tracing::warn!("Logs event for unknown dataflow `{dataflow_id}`"); - return Ok(RunStatus::Continue); + return Ok(()); }; let Some(subscribers) = dataflow.mappings.get(&output_id) else { @@ -1476,7 +2009,7 @@ impl Daemon { output_id, dataflow.mappings ); - return Ok(RunStatus::Continue); + return Ok(()); }; let mut closed = Vec::new(); @@ -1509,6 +2042,7 @@ impl Daemon { DoraEvent::SpawnedNodeResult { dataflow_id, node_id, + dynamic_node, exit_status, } => { let mut logger = self @@ -1596,23 +2130,39 @@ impl Daemon { .or_default() .insert(node_id.clone(), node_result); - self.handle_node_stop(dataflow_id, &node_id).await?; + self.handle_node_stop(dataflow_id, &node_id, dynamic_node) + .await?; + } + } + Ok(()) + } - if let Some(exit_when_done) = &mut self.exit_when_done { - exit_when_done.remove(&(dataflow_id, node_id)); - if exit_when_done.is_empty() { - tracing::info!( - "exiting daemon because all required dataflows are finished" - ); - return Ok(RunStatus::Exit); - } - } - if self.exit_when_all_finished && self.running.is_empty() { - return Ok(RunStatus::Exit); + fn base_working_dir( + &self, + local_working_dir: Option, + session_id: SessionId, + ) -> eyre::Result { + match local_working_dir { + Some(working_dir) => { + // check that working directory exists + if working_dir.exists() { + Ok(working_dir) + } else { + bail!( + "working directory does not exist: {}", + working_dir.display(), + ) } } + None => { + // use subfolder of daemon working dir + let daemon_working_dir = + current_dir().context("failed to get daemon working dir")?; + Ok(daemon_working_dir + .join("_work") + .join(session_id.uuid().to_string())) + } } - Ok(RunStatus::Continue) } } @@ -1777,7 +2327,7 @@ fn close_input( } #[derive(Debug)] -struct RunningNode { +pub struct RunningNode { pid: Option, node_config: NodeConfig, } @@ -2082,6 +2632,25 @@ pub enum Event { CtrlC, SecondCtrlC, DaemonError(eyre::Report), + SpawnNodeResult { 
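// Illustrative sketch of the fallback in `base_working_dir` above: when the
// CLI did not send a local working directory (remote-daemon case), the daemon
// works in a per-session subfolder of its own working directory. The
// `session_id` parameter is a plain string stand-in for the real `SessionId`.
use std::env::current_dir;
use std::path::PathBuf;

fn fallback_working_dir(session_id: &str) -> std::io::Result<PathBuf> {
    Ok(current_dir()?.join("_work").join(session_id))
}

// e.g. a daemon started in `/opt/dora` with session `<session-uuid>` would
// work in `/opt/dora/_work/<session-uuid>`.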
+ dataflow_id: DataflowId, + node_id: NodeId, + dynamic_node: bool, + result: Result, + }, + BuildDataflowResult { + build_id: BuildId, + session_id: SessionId, + result: eyre::Result, + }, + SpawnDataflowResult { + dataflow_id: Uuid, + result: eyre::Result<()>, + }, + NodeStopped { + dataflow_id: Uuid, + node_id: NodeId, + }, } impl From for Event { @@ -2090,6 +2659,26 @@ impl From for Event { } } +impl Event { + pub fn kind(&self) -> &'static str { + match self { + Event::Node { .. } => "Node", + Event::Coordinator(_) => "Coordinator", + Event::Daemon(_) => "Daemon", + Event::Dora(_) => "Dora", + Event::DynamicNode(_) => "DynamicNode", + Event::HeartbeatInterval => "HeartbeatInterval", + Event::CtrlC => "CtrlC", + Event::SecondCtrlC => "SecondCtrlC", + Event::DaemonError(_) => "DaemonError", + Event::SpawnNodeResult { .. } => "SpawnNodeResult", + Event::BuildDataflowResult { .. } => "BuildDataflowResult", + Event::SpawnDataflowResult { .. } => "SpawnDataflowResult", + Event::NodeStopped { .. } => "NodeStopped", + } + } +} + #[derive(Debug)] pub enum DaemonNodeEvent { OutputsDone { @@ -2136,6 +2725,7 @@ pub enum DoraEvent { SpawnedNodeResult { dataflow_id: DataflowId, node_id: NodeId, + dynamic_node: bool, exit_status: NodeExitStatus, }, } @@ -2255,7 +2845,9 @@ impl CoreNodeKindExt for CoreNodeKind { fn dynamic(&self) -> bool { match self { CoreNodeKind::Runtime(_n) => false, - CoreNodeKind::Custom(n) => n.source == DYNAMIC_SOURCE, + CoreNodeKind::Custom(n) => { + matches!(&n.source, NodeSource::Local) && n.path == DYNAMIC_SOURCE + } } } } diff --git a/binaries/daemon/src/log.rs b/binaries/daemon/src/log.rs index c9e41334..283213c8 100644 --- a/binaries/daemon/src/log.rs +++ b/binaries/daemon/src/log.rs @@ -1,14 +1,21 @@ use std::{ + ops::{Deref, DerefMut}, path::{Path, PathBuf}, sync::Arc, }; -use dora_core::{config::NodeId, uhlc}; +use dora_core::{ + build::{BuildLogger, LogLevelOrStdout}, + config::NodeId, + uhlc, +}; use dora_message::{ common::{DaemonId, LogLevel, LogMessage, Timestamped}, daemon_to_coordinator::{CoordinatorRequest, DaemonEvent}, + BuildId, }; use eyre::Context; +use flume::Sender; use tokio::net::TcpStream; use uuid::Uuid; @@ -39,11 +46,18 @@ impl NodeLogger<'_> { .log(level, Some(self.node_id.clone()), target, message) .await } + + pub async fn try_clone(&self) -> eyre::Result> { + Ok(NodeLogger { + node_id: self.node_id.clone(), + logger: self.logger.try_clone().await?, + }) + } } pub struct DataflowLogger<'a> { dataflow_id: Uuid, - logger: &'a mut DaemonLogger, + logger: CowMut<'a, DaemonLogger>, } impl<'a> DataflowLogger<'a> { @@ -57,12 +71,12 @@ impl<'a> DataflowLogger<'a> { pub fn reborrow(&mut self) -> DataflowLogger { DataflowLogger { dataflow_id: self.dataflow_id, - logger: self.logger, + logger: CowMut::Borrowed(&mut self.logger), } } pub fn inner(&self) -> &DaemonLogger { - self.logger + &self.logger } pub async fn log( @@ -73,9 +87,64 @@ impl<'a> DataflowLogger<'a> { message: impl Into, ) { self.logger - .log(level, self.dataflow_id, node_id, target, message) + .log(level, Some(self.dataflow_id), node_id, target, message) .await } + + pub async fn try_clone(&self) -> eyre::Result> { + Ok(DataflowLogger { + dataflow_id: self.dataflow_id, + logger: CowMut::Owned(self.logger.try_clone().await?), + }) + } +} + +pub struct NodeBuildLogger<'a> { + build_id: BuildId, + node_id: NodeId, + logger: CowMut<'a, DaemonLogger>, +} + +impl NodeBuildLogger<'_> { + pub async fn log( + &mut self, + level: impl Into + Send, + message: impl Into, + ) { + self.logger + 
.log_build( + self.build_id, + level.into(), + None, + Some(self.node_id.clone()), + message, + ) + .await + } + + pub async fn try_clone_impl(&self) -> eyre::Result> { + Ok(NodeBuildLogger { + build_id: self.build_id, + node_id: self.node_id.clone(), + logger: CowMut::Owned(self.logger.try_clone().await?), + }) + } +} + +impl BuildLogger for NodeBuildLogger<'_> { + type Clone = NodeBuildLogger<'static>; + + fn log_message( + &mut self, + level: impl Into + Send, + message: impl Into + Send, + ) -> impl std::future::Future + Send { + self.log(level, message) + } + + fn try_clone(&self) -> impl std::future::Future> + Send { + self.try_clone_impl() + } } pub struct DaemonLogger { @@ -87,7 +156,15 @@ impl DaemonLogger { pub fn for_dataflow(&mut self, dataflow_id: Uuid) -> DataflowLogger { DataflowLogger { dataflow_id, - logger: self, + logger: CowMut::Borrowed(self), + } + } + + pub fn for_node_build(&mut self, build_id: BuildId, node_id: NodeId) -> NodeBuildLogger { + NodeBuildLogger { + build_id, + node_id, + logger: CowMut::Borrowed(self), } } @@ -98,15 +175,39 @@ impl DaemonLogger { pub async fn log( &mut self, level: LogLevel, - dataflow_id: Uuid, + dataflow_id: Option, node_id: Option, target: Option, message: impl Into, ) { let message = LogMessage { + build_id: None, daemon_id: Some(self.daemon_id.clone()), dataflow_id, node_id, + level: level.into(), + target, + module_path: None, + file: None, + line: None, + message: message.into(), + }; + self.logger.log(message).await + } + + pub async fn log_build( + &mut self, + build_id: BuildId, + level: LogLevelOrStdout, + target: Option, + node_id: Option, + message: impl Into, + ) { + let message = LogMessage { + build_id: Some(build_id), + daemon_id: Some(self.daemon_id.clone()), + dataflow_id: None, + node_id, level, target, module_path: None, @@ -120,10 +221,17 @@ impl DaemonLogger { pub(crate) fn daemon_id(&self) -> &DaemonId { &self.daemon_id } + + pub async fn try_clone(&self) -> eyre::Result { + Ok(Self { + daemon_id: self.daemon_id.clone(), + logger: self.logger.try_clone().await?, + }) + } } pub struct Logger { - pub(super) coordinator_connection: Option, + pub(super) destination: LogDestination, pub(super) daemon_id: DaemonId, pub(super) clock: Arc, } @@ -137,73 +245,179 @@ impl Logger { } pub async fn log(&mut self, message: LogMessage) { - if let Some(connection) = &mut self.coordinator_connection { - let msg = serde_json::to_vec(&Timestamped { - inner: CoordinatorRequest::Event { - daemon_id: self.daemon_id.clone(), - event: DaemonEvent::Log(message.clone()), - }, - timestamp: self.clock.new_timestamp(), - }) - .expect("failed to serialize log message"); - match socket_stream_send(connection, &msg) - .await - .wrap_err("failed to send log message to dora-coordinator") - { - Ok(()) => return, - Err(err) => tracing::warn!("{err:?}"), + match &mut self.destination { + LogDestination::Coordinator { + coordinator_connection, + } => { + let message = Timestamped { + inner: CoordinatorRequest::Event { + daemon_id: self.daemon_id.clone(), + event: DaemonEvent::Log(message.clone()), + }, + timestamp: self.clock.new_timestamp(), + }; + Self::log_to_coordinator(message, coordinator_connection).await } - } - - // log message using tracing if reporting to coordinator is not possible - match message.level { - LogLevel::Error => { - if let Some(node_id) = message.node_id { - tracing::error!("{}/{} errored:", message.dataflow_id.to_string(), node_id); - } - for line in message.message.lines() { - tracing::error!(" {}", line); - } + 
LogDestination::Channel { sender } => { + let _ = sender.send_async(message).await; } - LogLevel::Warn => { - if let Some(node_id) = message.node_id { - tracing::warn!("{}/{} warned:", message.dataflow_id.to_string(), node_id); - } - for line in message.message.lines() { - tracing::warn!(" {}", line); + LogDestination::Tracing => { + // log message using tracing if reporting to coordinator is not possible + match message.level { + LogLevelOrStdout::Stdout => { + tracing::info!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ) + } + LogLevelOrStdout::LogLevel(level) => match level { + LogLevel::Error => { + tracing::error!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + LogLevel::Warn => { + tracing::warn!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + LogLevel::Info => { + tracing::info!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + LogLevel::Debug => { + tracing::debug!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + _ => {} + }, } } - LogLevel::Info => { - if let Some(node_id) = message.node_id { - tracing::info!("{}/{} info:", message.dataflow_id.to_string(), node_id); - } - - for line in message.message.lines() { - tracing::info!(" {}", line); - } - } - _ => {} } } pub async fn try_clone(&self) -> eyre::Result { - let coordinator_connection = match &self.coordinator_connection { - Some(c) => { - let addr = c + let destination = match &self.destination { + LogDestination::Coordinator { + coordinator_connection, + } => { + let addr = coordinator_connection .peer_addr() .context("failed to get coordinator peer addr")?; let new_connection = TcpStream::connect(addr) .await .context("failed to connect to coordinator during logger clone")?; - Some(new_connection) + LogDestination::Coordinator { + coordinator_connection: new_connection, + } } - None => None, + LogDestination::Channel { sender } => LogDestination::Channel { + sender: sender.clone(), + }, + LogDestination::Tracing => LogDestination::Tracing, }; Ok(Self { - coordinator_connection, + destination, daemon_id: self.daemon_id.clone(), clock: self.clock.clone(), }) } + + async fn log_to_coordinator( + message: Timestamped, + connection: &mut 
TcpStream, + ) { + let msg = serde_json::to_vec(&message).expect("failed to serialize log message"); + match socket_stream_send(connection, &msg) + .await + .wrap_err("failed to send log message to dora-coordinator") + { + Ok(()) => return, + Err(err) => tracing::warn!("{err:?}"), + } + } +} + +pub enum LogDestination { + Coordinator { coordinator_connection: TcpStream }, + Channel { sender: Sender }, + Tracing, +} + +enum CowMut<'a, T> { + Borrowed(&'a mut T), + Owned(T), +} + +impl Deref for CowMut<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + match self { + CowMut::Borrowed(v) => v, + CowMut::Owned(v) => v, + } + } +} + +impl DerefMut for CowMut<'_, T> { + fn deref_mut(&mut self) -> &mut Self::Target { + match self { + CowMut::Borrowed(v) => v, + CowMut::Owned(v) => v, + } + } +} + +struct Indent<'a>(&'a str); + +impl std::fmt::Display for Indent<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + for line in self.0.lines() { + write!(f, " {}", line)?; + } + Ok(()) + } } diff --git a/binaries/daemon/src/pending.rs b/binaries/daemon/src/pending.rs index 89305d80..757a858d 100644 --- a/binaries/daemon/src/pending.rs +++ b/binaries/daemon/src/pending.rs @@ -59,6 +59,10 @@ impl PendingNodes { self.external_nodes = value; } + pub fn local_nodes_pending(&self) -> bool { + !self.local_nodes.is_empty() + } + pub async fn handle_node_subscription( &mut self, node_id: NodeId, diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 955a1dc7..2ecc5c1a 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -20,6 +20,7 @@ use dora_message::{ common::{LogLevel, LogMessage}, daemon_to_coordinator::{DataMessage, NodeExitStatus, Timestamped}, daemon_to_node::{NodeConfig, RuntimeConfig}, + id::NodeId, DataflowId, }; use dora_node_api::{ @@ -29,6 +30,7 @@ use dora_node_api::{ }; use eyre::{bail, ContextCompat, WrapErr}; use std::{ + future::Future, path::{Path, PathBuf}, process::Stdio, sync::Arc, @@ -40,565 +42,671 @@ use tokio::{ }; use tracing::error; -/// clock is required for generating timestamps when dropping messages early because queue is full -pub async fn spawn_node( - dataflow_id: DataflowId, - working_dir: &Path, - node: ResolvedNode, - daemon_tx: mpsc::Sender>, - dataflow_descriptor: Descriptor, - clock: Arc, - node_stderr_most_recent: Arc>, - uv: bool, - logger: &mut NodeLogger<'_>, -) -> eyre::Result { - let node_id = node.id.clone(); - logger - .log( - LogLevel::Debug, - Some("daemon::spawner".into()), - "spawning node", +#[derive(Clone)] +pub struct Spawner { + pub dataflow_id: DataflowId, + pub daemon_tx: mpsc::Sender>, + pub dataflow_descriptor: Descriptor, + /// clock is required for generating timestamps when dropping messages early because queue is full + pub clock: Arc, + pub uv: bool, +} + +impl Spawner { + pub async fn spawn_node( + self, + node: ResolvedNode, + node_working_dir: PathBuf, + node_stderr_most_recent: Arc>, + logger: &mut NodeLogger<'_>, + ) -> eyre::Result>> { + let dataflow_id = self.dataflow_id; + let node_id = node.id.clone(); + logger + .log( + LogLevel::Debug, + Some("daemon::spawner".into()), + "spawning node", + ) + .await; + + let queue_sizes = node_inputs(&node) + .into_iter() + .map(|(k, v)| (k, v.queue_size.unwrap_or(10))) + .collect(); + let daemon_communication = spawn_listener_loop( + &dataflow_id, + &node_id, + &self.daemon_tx, + self.dataflow_descriptor.communication.local, + queue_sizes, + self.clock.clone(), ) - .await; - - let queue_sizes = 
node_inputs(&node) - .into_iter() - .map(|(k, v)| (k, v.queue_size.unwrap_or(10))) - .collect(); - let daemon_communication = spawn_listener_loop( - &dataflow_id, - &node_id, - &daemon_tx, - dataflow_descriptor.communication.local, - queue_sizes, - clock.clone(), - ) - .await?; - let send_stdout_to = node - .send_stdout_as() - .context("Could not resolve `send_stdout_as` configuration")?; - - let node_config = NodeConfig { - dataflow_id, - node_id: node_id.clone(), - run_config: node.kind.run_config(), - daemon_communication, - dataflow_descriptor, - dynamic: node.kind.dynamic(), - }; + .await?; - let mut child = match node.kind { - dora_core::descriptor::CoreNodeKind::Custom(n) => { - let mut command = match n.source.as_str() { - DYNAMIC_SOURCE => { - return Ok(RunningNode { - pid: None, - node_config, - }); - } - SHELL_SOURCE => { - if cfg!(target_os = "windows") { - let mut cmd = tokio::process::Command::new("cmd"); - cmd.args(["/C", &n.args.clone().unwrap_or_default()]); - cmd - } else { - let mut cmd = tokio::process::Command::new("sh"); - cmd.args(["-c", &n.args.clone().unwrap_or_default()]); - cmd - } - } - source => { - let resolved_path = if source_is_url(source) { - // try to download the shared library - let target_dir = Path::new("build"); - download_file(source, target_dir) - .await - .wrap_err("failed to download custom node")? - } else { - resolve_path(source, working_dir).wrap_err_with(|| { - format!("failed to resolve node source `{}`", source) - })? - }; + let node_config = NodeConfig { + dataflow_id, + node_id: node_id.clone(), + run_config: node.kind.run_config(), + daemon_communication, + dataflow_descriptor: serde_yaml::to_value(&self.dataflow_descriptor) + .context("failed to serialize dataflow descriptor to YAML")?, + dynamic: node.kind.dynamic(), + }; - // If extension is .py, use python to run the script - let mut cmd = match resolved_path.extension().map(|ext| ext.to_str()) { - Some(Some("py")) => { - let mut cmd = if uv { - let mut cmd = tokio::process::Command::new("uv"); - cmd.arg("run"); - cmd.arg("python"); - logger - .log( - LogLevel::Info, - Some("spawner".into()), - format!( - "spawning: uv run python -u {}", - resolved_path.display() - ), - ) - .await; - cmd - } else { - let python = get_python_path().wrap_err( - "Could not find python path when spawning custom node", - )?; - logger - .log( - LogLevel::Info, - Some("spawner".into()), - format!( - "spawning: {:?} -u {}", - &python, - resolved_path.display() - ), - ) - .await; - - tokio::process::Command::new(python) - }; - // Force python to always flush stdout/stderr buffer - cmd.arg("-u"); - cmd.arg(&resolved_path); - cmd + let mut logger = logger + .try_clone() + .await + .wrap_err("failed to clone logger")?; + let task = async move { + self.prepare_node_inner( + node, + node_working_dir, + &mut logger, + dataflow_id, + node_config, + node_stderr_most_recent, + ) + .await + }; + Ok(task) + } + + async fn prepare_node_inner( + self, + node: ResolvedNode, + node_working_dir: PathBuf, + logger: &mut NodeLogger<'_>, + dataflow_id: uuid::Uuid, + node_config: NodeConfig, + node_stderr_most_recent: Arc>, + ) -> eyre::Result { + let (command, error_msg) = match &node.kind { + dora_core::descriptor::CoreNodeKind::Custom(n) => { + let mut command = + path_spawn_command(&node_working_dir, self.uv, logger, n, true).await?; + + if let Some(command) = &mut command { + command.current_dir(&node_working_dir); + command.stdin(Stdio::null()); + + command.env( + "DORA_NODE_CONFIG", + 
serde_yaml::to_string(&node_config.clone()) + .wrap_err("failed to serialize node config")?, + ); + // Injecting the env variable defined in the `yaml` into + // the node runtime. + if let Some(envs) = &node.env { + for (key, value) in envs { + command.env(key, value.to_string()); } - _ => { - logger - .log( - LogLevel::Info, - Some("spawner".into()), - format!("spawning: {}", resolved_path.display()), - ) - .await; - if uv { - let mut cmd = tokio::process::Command::new("uv"); - cmd.arg("run"); - cmd.arg(&resolved_path); - cmd - } else { - tokio::process::Command::new(&resolved_path) - } + } + if let Some(envs) = &n.envs { + // node has some inner env variables -> add them too + for (key, value) in envs { + command.env(key, value.to_string()); } - }; - - if let Some(args) = &n.args { - cmd.args(args.split_ascii_whitespace()); } - cmd - } - }; - command.current_dir(working_dir); - command.stdin(Stdio::null()); - - command.env( - "DORA_NODE_CONFIG", - serde_yaml::to_string(&node_config.clone()) - .wrap_err("failed to serialize node config")?, - ); - // Injecting the env variable defined in the `yaml` into - // the node runtime. - if let Some(envs) = node.env { - for (key, value) in envs { - command.env(key, value.to_string()); - } - } - if let Some(envs) = n.envs { - // node has some inner env variables -> add them too - for (key, value) in envs { - command.env(key, value.to_string()); - } - } + // Set the process group to 0 to ensure that the spawned process does not exit immediately on CTRL-C + #[cfg(unix)] + command.process_group(0); - // Set the process group to 0 to ensure that the spawned process does not exit immediately on CTRL-C - #[cfg(unix)] - command.process_group(0); - - command.env("PYTHONUNBUFFERED", "1"); - command - .stdin(Stdio::null()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn() - .wrap_err_with(move || { - format!( - "failed to run `{}` with args `{}`", - n.source, - n.args.as_deref().unwrap_or_default(), - ) - })? - } - dora_core::descriptor::CoreNodeKind::Runtime(n) => { - let python_operators: Vec<&OperatorDefinition> = n - .operators - .iter() - .filter(|x| matches!(x.config.source, OperatorSource::Python { .. })) - .collect(); - - let other_operators = n - .operators - .iter() - .any(|x| !matches!(x.config.source, OperatorSource::Python { .. })); - - let mut command = if !python_operators.is_empty() && !other_operators { - // Use python to spawn runtime if there is a python operator - - // TODO: Handle multi-operator runtime once sub-interpreter is supported - if python_operators.len() > 2 { - eyre::bail!( - "Runtime currently only support one Python Operator. + command.env("PYTHONUNBUFFERED", "1"); + command + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + }; + + let error_msg = format!( + "failed to run `{}` with args `{}`", + n.path, + n.args.as_deref().unwrap_or_default(), + ); + (command, error_msg) + } + dora_core::descriptor::CoreNodeKind::Runtime(n) => { + let python_operators: Vec<&OperatorDefinition> = n + .operators + .iter() + .filter(|x| matches!(x.config.source, OperatorSource::Python { .. })) + .collect(); + + let other_operators = n + .operators + .iter() + .any(|x| !matches!(x.config.source, OperatorSource::Python { .. 
})); + + let mut command = if !python_operators.is_empty() && !other_operators { + // Use python to spawn runtime if there is a python operator + + // TODO: Handle multi-operator runtime once sub-interpreter is supported + if python_operators.len() > 2 { + eyre::bail!( + "Runtime currently only support one Python Operator. This is because pyo4 sub-interpreter is not yet available. See: https://github.com/PyO4/pyo3/issues/576" - ); - } + ); + } - let python_operator = python_operators - .first() - .context("Runtime had no operators definition.")?; + let python_operator = python_operators + .first() + .context("Runtime had no operators definition.")?; - if let OperatorSource::Python(PythonSource { - source: _, - conda_env: Some(conda_env), - }) = &python_operator.config.source - { - let conda = which::which("conda").context( + if let OperatorSource::Python(PythonSource { + source: _, + conda_env: Some(conda_env), + }) = &python_operator.config.source + { + let conda = which::which("conda").context( "failed to find `conda`, yet a `conda_env` was defined. Make sure that `conda` is available.", - )?; - let mut command = tokio::process::Command::new(conda); - command.args([ - "run", - "-n", - conda_env, - "python", - "-uc", - format!("import dora; dora.start_runtime() # {}", node.id).as_str(), - ]); - command - } else { - let mut cmd = if uv { - let mut cmd = tokio::process::Command::new("uv"); - cmd.arg("run"); - cmd.arg("python"); - tracing::info!( + )?; + let mut command = tokio::process::Command::new(conda); + command.args([ + "run", + "-n", + conda_env, + "python", + "-uc", + format!("import dora; dora.start_runtime() # {}", node.id).as_str(), + ]); + Some(command) + } else { + let mut cmd = if self.uv { + let mut cmd = tokio::process::Command::new("uv"); + cmd.arg("run"); + cmd.arg("python"); + tracing::info!( "spawning: uv run python -uc import dora; dora.start_runtime() # {}", node.id ); - cmd - } else { + cmd + } else { + let python = get_python_path() + .wrap_err("Could not find python path when spawning custom node")?; + tracing::info!( + "spawning: python -uc import dora; dora.start_runtime() # {}", + node.id + ); + + tokio::process::Command::new(python) + }; + // Force python to always flush stdout/stderr buffer + cmd.args([ + "-uc", + format!("import dora; dora.start_runtime() # {}", node.id).as_str(), + ]); + Some(cmd) + } + } else if python_operators.is_empty() && other_operators { + let current_exe = std::env::current_exe() + .wrap_err("failed to get current executable path")?; + let mut file_name = current_exe.clone(); + file_name.set_extension(""); + let file_name = file_name + .file_name() + .and_then(|s| s.to_str()) + .context("failed to get file name from current executable")?; + + // Check if the current executable is a python binary meaning that dora is installed within the python environment + if file_name.ends_with("python") || file_name.ends_with("python3") { + // Use the current executable to spawn runtime let python = get_python_path() .wrap_err("Could not find python path when spawning custom node")?; + let mut cmd = tokio::process::Command::new(python); + tracing::info!( "spawning: python -uc import dora; dora.start_runtime() # {}", node.id ); - tokio::process::Command::new(python) - }; - // Force python to always flush stdout/stderr buffer - cmd.args([ - "-uc", - format!("import dora; dora.start_runtime() # {}", node.id).as_str(), - ]); - cmd - } - } else if python_operators.is_empty() && other_operators { - let current_exe = - 
std::env::current_exe().wrap_err("failed to get current executable path")?; - let mut file_name = current_exe.clone(); - file_name.set_extension(""); - let file_name = file_name - .file_name() - .and_then(|s| s.to_str()) - .context("failed to get file name from current executable")?; - - // Check if the current executable is a python binary meaning that dora is installed within the python environment - if file_name.ends_with("python") || file_name.ends_with("python3") { - // Use the current executable to spawn runtime - let python = get_python_path() - .wrap_err("Could not find python path when spawning custom node")?; - let mut cmd = tokio::process::Command::new(python); - - tracing::info!( - "spawning: python -uc import dora; dora.start_runtime() # {}", - node.id - ); - - cmd.args([ - "-uc", - format!("import dora; dora.start_runtime() # {}", node.id).as_str(), - ]); - cmd + cmd.args([ + "-uc", + format!("import dora; dora.start_runtime() # {}", node.id).as_str(), + ]); + Some(cmd) + } else { + let mut cmd = tokio::process::Command::new( + std::env::current_exe() + .wrap_err("failed to get current executable path")?, + ); + cmd.arg("runtime"); + Some(cmd) + } } else { - let mut cmd = tokio::process::Command::new( - std::env::current_exe() - .wrap_err("failed to get current executable path")?, - ); - cmd.arg("runtime"); - cmd - } - } else { - bail!( - "Cannot spawn runtime with both Python and non-Python operators. \ + bail!( + "Cannot spawn runtime with both Python and non-Python operators. \ Please use a single operator or ensure that all operators are Python-based." - ); - }; + ); + }; - command.current_dir(working_dir); + let runtime_config = RuntimeConfig { + node: node_config.clone(), + operators: n.operators.clone(), + }; - let runtime_config = RuntimeConfig { - node: node_config.clone(), - operators: n.operators, - }; - command.env( - "DORA_RUNTIME_CONFIG", - serde_yaml::to_string(&runtime_config) - .wrap_err("failed to serialize runtime config")?, - ); - // Injecting the env variable defined in the `yaml` into - // the node runtime. - if let Some(envs) = node.env { - for (key, value) in envs { - command.env(key, value.to_string()); - } - } - // Set the process group to 0 to ensure that the spawned process does not exit immediately on CTRL-C - #[cfg(unix)] - command.process_group(0); - - command - .stdin(Stdio::null()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn() - .wrap_err(format!( + if let Some(command) = &mut command { + command.current_dir(&node_working_dir); + + command.env( + "DORA_RUNTIME_CONFIG", + serde_yaml::to_string(&runtime_config) + .wrap_err("failed to serialize runtime config")?, + ); + // Injecting the env variable defined in the `yaml` into + // the node runtime. + if let Some(envs) = &node.env { + for (key, value) in envs { + command.env(key, value.to_string()); + } + } + // Set the process group to 0 to ensure that the spawned process does not exit immediately on CTRL-C + #[cfg(unix)] + command.process_group(0); + + command + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + }; + let error_msg = format!( "failed to run runtime {}/{}", runtime_config.node.dataflow_id, runtime_config.node.node_id - ))? 
- } - }; + ); + (command, error_msg) + } + }; + Ok(PreparedNode { + command, + spawn_error_msg: error_msg, + node_working_dir, + dataflow_id, + node, + node_config, + clock: self.clock, + daemon_tx: self.daemon_tx, + node_stderr_most_recent, + }) + } +} - let pid = crate::ProcessId::new(child.id().context( - "Could not get the pid for the just spawned node and indicate that there is an error", - )?); - logger - .log( - LogLevel::Debug, - Some("spawner".into()), - format!("spawned node with pid {pid:?}"), - ) - .await; +pub struct PreparedNode { + command: Option, + spawn_error_msg: String, + node_working_dir: PathBuf, + dataflow_id: DataflowId, + node: ResolvedNode, + node_config: NodeConfig, + clock: Arc, + daemon_tx: mpsc::Sender>, + node_stderr_most_recent: Arc>, +} + +impl PreparedNode { + pub fn node_id(&self) -> &NodeId { + &self.node.id + } - let dataflow_dir: PathBuf = working_dir.join("out").join(dataflow_id.to_string()); - if !dataflow_dir.exists() { - std::fs::create_dir_all(&dataflow_dir).context("could not create dataflow_dir")?; + pub fn dynamic(&self) -> bool { + self.node.kind.dynamic() } - let (tx, mut rx) = mpsc::channel(10); - let mut file = File::create(log::log_path(working_dir, &dataflow_id, &node_id)) + + pub async fn spawn(mut self, logger: &mut NodeLogger<'_>) -> eyre::Result { + let mut child = match &mut self.command { + Some(command) => command.spawn().wrap_err(self.spawn_error_msg)?, + None => { + return Ok(RunningNode { + pid: None, + node_config: self.node_config, + }) + } + }; + + let pid = crate::ProcessId::new(child.id().context( + "Could not get the pid for the just spawned node and indicate that there is an error", + )?); + logger + .log( + LogLevel::Debug, + Some("spawner".into()), + format!("spawned node with pid {pid:?}"), + ) + .await; + + let dataflow_dir: PathBuf = self + .node_working_dir + .join("out") + .join(self.dataflow_id.to_string()); + if !dataflow_dir.exists() { + std::fs::create_dir_all(&dataflow_dir).context("could not create dataflow_dir")?; + } + let (tx, mut rx) = mpsc::channel(10); + let mut file = File::create(log::log_path( + &self.node_working_dir, + &self.dataflow_id, + &self.node.id, + )) .await .expect("Failed to create log file"); - let mut child_stdout = - tokio::io::BufReader::new(child.stdout.take().expect("failed to take stdout")); - let running_node = RunningNode { - pid: Some(pid), - node_config, - }; - let stdout_tx = tx.clone(); - let node_id = node.id.clone(); - // Stdout listener stream - tokio::spawn(async move { - let mut buffer = String::new(); - let mut finished = false; - while !finished { - let mut raw = Vec::new(); - finished = match child_stdout - .read_until(b'\n', &mut raw) - .await - .wrap_err_with(|| format!("failed to read stdout line from spawned node {node_id}")) - { - Ok(0) => true, - Ok(_) => false, - Err(err) => { - tracing::warn!("{err:?}"); - false + let mut child_stdout = + tokio::io::BufReader::new(child.stdout.take().expect("failed to take stdout")); + let running_node = RunningNode { + pid: Some(pid), + node_config: self.node_config, + }; + let stdout_tx = tx.clone(); + let node_id = self.node.id.clone(); + // Stdout listener stream + tokio::spawn(async move { + let mut buffer = String::new(); + let mut finished = false; + while !finished { + let mut raw = Vec::new(); + finished = match child_stdout + .read_until(b'\n', &mut raw) + .await + .wrap_err_with(|| { + format!("failed to read stdout line from spawned node {node_id}") + }) { + Ok(0) => true, + Ok(_) => false, + Err(err) => { + 
tracing::warn!("{err:?}"); + false + } + }; + + match String::from_utf8(raw) { + Ok(s) => buffer.push_str(&s), + Err(err) => { + let lossy = String::from_utf8_lossy(err.as_bytes()); + tracing::warn!( + "stdout not valid UTF-8 string (node {node_id}): {}: {lossy}", + err.utf8_error() + ); + buffer.push_str(&lossy) + } + }; + + if buffer.contains("TRACE") + || buffer.contains("INFO") + || buffer.contains("DEBUG") + || buffer.contains("WARN") + || buffer.contains("ERROR") + { + // tracing output, potentially multi-line -> keep reading following lines + // until double-newline + if !buffer.ends_with("\n\n") && !finished { + continue; + } } - }; - match String::from_utf8(raw) { - Ok(s) => buffer.push_str(&s), - Err(err) => { - let lossy = String::from_utf8_lossy(err.as_bytes()); - tracing::warn!( - "stdout not valid UTF-8 string (node {node_id}): {}: {lossy}", - err.utf8_error() - ); - buffer.push_str(&lossy) + // send the buffered lines + let lines = std::mem::take(&mut buffer); + let sent = stdout_tx.send(lines.clone()).await; + if sent.is_err() { + println!("Could not log: {lines}"); } + } + }); + + let mut child_stderr = + tokio::io::BufReader::new(child.stderr.take().expect("failed to take stderr")); + + // Stderr listener stream + let stderr_tx = tx.clone(); + let node_id = self.node.id.clone(); + let uhlc = self.clock.clone(); + let daemon_tx_log = self.daemon_tx.clone(); + tokio::spawn(async move { + let mut buffer = String::new(); + let mut finished = false; + while !finished { + let mut raw = Vec::new(); + finished = match child_stderr + .read_until(b'\n', &mut raw) + .await + .wrap_err_with(|| { + format!("failed to read stderr line from spawned node {node_id}") + }) { + Ok(0) => true, + Ok(_) => false, + Err(err) => { + tracing::warn!("{err:?}"); + true + } + }; + + let new = match String::from_utf8(raw) { + Ok(s) => s, + Err(err) => { + let lossy = String::from_utf8_lossy(err.as_bytes()); + tracing::warn!( + "stderr not valid UTF-8 string (node {node_id}): {}: {lossy}", + err.utf8_error() + ); + lossy.into_owned() + } + }; + + buffer.push_str(&new); + + self.node_stderr_most_recent.force_push(new); + + // send the buffered lines + let lines = std::mem::take(&mut buffer); + let sent = stderr_tx.send(lines.clone()).await; + if sent.is_err() { + println!("Could not log: {lines}"); + } + } + }); + + let node_id = self.node.id.clone(); + let dynamic_node = self.node.kind.dynamic(); + let (log_finish_tx, log_finish_rx) = oneshot::channel(); + let clock = self.clock.clone(); + let daemon_tx = self.daemon_tx.clone(); + let dataflow_id = self.dataflow_id; + tokio::spawn(async move { + let exit_status = NodeExitStatus::from(child.wait().await); + let _ = log_finish_rx.await; + let event = DoraEvent::SpawnedNodeResult { + dataflow_id, + node_id, + exit_status, + dynamic_node, + } + .into(); + let event = Timestamped { + inner: event, + timestamp: clock.new_timestamp(), }; + let _ = daemon_tx.send(event).await; + }); + + let node_id = self.node.id.clone(); + let daemon_id = logger.inner().inner().daemon_id().clone(); + let mut cloned_logger = logger + .inner() + .inner() + .inner() + .try_clone() + .await + .context("failed to clone logger")?; + + let send_stdout_to = self + .node + .send_stdout_as() + .context("Could not resolve `send_stdout_as` configuration")?; + + // Log to file stream. 
+ tokio::spawn(async move { + while let Some(message) = rx.recv().await { + // If log is an output, we're sending the logs to the dataflow + if let Some(stdout_output_name) = &send_stdout_to { + // Convert logs to DataMessage + let array = message.as_str().into_arrow(); + + let array: ArrayData = array.into(); + let total_len = required_data_size(&array); + let mut sample: AVec> = + AVec::__from_elem(128, 0, total_len); + + let type_info = copy_array_into_sample(&mut sample, &array); + + let metadata = Metadata::new(uhlc.new_timestamp(), type_info); + let output_id = OutputId( + node_id.clone(), + DataId::from(stdout_output_name.to_string()), + ); + let event = DoraEvent::Logs { + dataflow_id, + output_id, + metadata, + message: DataMessage::Vec(sample), + } + .into(); + let event = Timestamped { + inner: event, + timestamp: uhlc.new_timestamp(), + }; + let _ = daemon_tx_log.send(event).await; + } - if buffer.contains("TRACE") - || buffer.contains("INFO") - || buffer.contains("DEBUG") - || buffer.contains("WARN") - || buffer.contains("ERROR") - { - // tracing output, potentially multi-line -> keep reading following lines - // until double-newline - if !buffer.ends_with("\n\n") && !finished { - continue; + let _ = file + .write_all(message.as_bytes()) + .await + .map_err(|err| error!("Could not log {message} to file due to {err}")); + let formatted = message.lines().fold(String::default(), |mut output, line| { + output.push_str(line); + output + }); + if std::env::var("DORA_QUIET").is_err() { + cloned_logger + .log(LogMessage { + daemon_id: Some(daemon_id.clone()), + dataflow_id: Some(dataflow_id), + build_id: None, + level: dora_core::build::LogLevelOrStdout::Stdout, + node_id: Some(node_id.clone()), + target: None, + message: formatted, + file: None, + line: None, + module_path: None, + }) + .await; } + // Make sure that all data has been synced to disk. 
+ let _ = file + .sync_all() + .await + .map_err(|err| error!("Could not sync logs to file due to {err}")); } + let _ = log_finish_tx + .send(()) + .map_err(|_| error!("Could not inform that log file thread finished")); + }); + Ok(running_node) + } +} - // send the buffered lines - let lines = std::mem::take(&mut buffer); - let sent = stdout_tx.send(lines.clone()).await; - if sent.is_err() { - println!("Could not log: {lines}"); +async fn path_spawn_command( + working_dir: &Path, + uv: bool, + logger: &mut NodeLogger<'_>, + node: &dora_core::descriptor::CustomNode, + permit_url: bool, +) -> eyre::Result> { + let cmd = match node.path.as_str() { + DYNAMIC_SOURCE => return Ok(None), + SHELL_SOURCE => { + if cfg!(target_os = "windows") { + let mut cmd = tokio::process::Command::new("cmd"); + cmd.args(["/C", &node.args.clone().unwrap_or_default()]); + cmd + } else { + let mut cmd = tokio::process::Command::new("sh"); + cmd.args(["-c", &node.args.clone().unwrap_or_default()]); + cmd } } - }); - - let mut child_stderr = - tokio::io::BufReader::new(child.stderr.take().expect("failed to take stderr")); - - // Stderr listener stream - let stderr_tx = tx.clone(); - let node_id = node.id.clone(); - let uhlc = clock.clone(); - let daemon_tx_log = daemon_tx.clone(); - tokio::spawn(async move { - let mut buffer = String::new(); - let mut finished = false; - while !finished { - let mut raw = Vec::new(); - finished = match child_stderr - .read_until(b'\n', &mut raw) - .await - .wrap_err_with(|| format!("failed to read stderr line from spawned node {node_id}")) - { - Ok(0) => true, - Ok(_) => false, - Err(err) => { - tracing::warn!("{err:?}"); - true + source => { + let resolved_path = if source_is_url(source) { + if !permit_url { + eyre::bail!("URL paths are not supported in this case"); } + // try to download the shared library + let target_dir = Path::new("build"); + download_file(source, target_dir) + .await + .wrap_err("failed to download custom node")? + } else { + resolve_path(source, working_dir) + .wrap_err_with(|| format!("failed to resolve node source `{}`", source))? 
}; - let new = match String::from_utf8(raw) { - Ok(s) => s, - Err(err) => { - let lossy = String::from_utf8_lossy(err.as_bytes()); - tracing::warn!( - "stderr not valid UTF-8 string (node {node_id}): {}: {lossy}", - err.utf8_error() - ); - lossy.into_owned() + // If extension is .py, use python to run the script + let mut cmd = match resolved_path.extension().map(|ext| ext.to_str()) { + Some(Some("py")) => { + let mut cmd = if uv { + let mut cmd = tokio::process::Command::new("uv"); + cmd.arg("run"); + cmd.arg("python"); + logger + .log( + LogLevel::Info, + Some("spawner".into()), + format!("spawning: uv run python -u {}", resolved_path.display()), + ) + .await; + cmd + } else { + let python = get_python_path() + .wrap_err("Could not find python path when spawning custom node")?; + logger + .log( + LogLevel::Info, + Some("spawner".into()), + format!("spawning: {:?} -u {}", &python, resolved_path.display()), + ) + .await; + + tokio::process::Command::new(python) + }; + // Force python to always flush stdout/stderr buffer + cmd.arg("-u"); + cmd.arg(&resolved_path); + cmd + } + _ => { + logger + .log( + LogLevel::Info, + Some("spawner".into()), + format!("spawning: {}", resolved_path.display()), + ) + .await; + if uv { + let mut cmd = tokio::process::Command::new("uv"); + cmd.arg("run"); + cmd.arg(&resolved_path); + cmd + } else { + tokio::process::Command::new(&resolved_path) + } } }; - buffer.push_str(&new); - - node_stderr_most_recent.force_push(new); - - // send the buffered lines - let lines = std::mem::take(&mut buffer); - let sent = stderr_tx.send(lines.clone()).await; - if sent.is_err() { - println!("Could not log: {lines}"); + if let Some(args) = &node.args { + cmd.args(args.split_ascii_whitespace()); } + cmd } - }); - - let node_id = node.id.clone(); - let (log_finish_tx, log_finish_rx) = oneshot::channel(); - tokio::spawn(async move { - let exit_status = NodeExitStatus::from(child.wait().await); - let _ = log_finish_rx.await; - let event = DoraEvent::SpawnedNodeResult { - dataflow_id, - node_id, - exit_status, - } - .into(); - let event = Timestamped { - inner: event, - timestamp: clock.new_timestamp(), - }; - let _ = daemon_tx.send(event).await; - }); - - let node_id = node.id.clone(); - let daemon_id = logger.inner().inner().daemon_id().clone(); - let mut cloned_logger = logger - .inner() - .inner() - .inner() - .try_clone() - .await - .context("failed to clone logger")?; - // Log to file stream. 
- tokio::spawn(async move { - while let Some(message) = rx.recv().await { - // If log is an output, we're sending the logs to the dataflow - if let Some(stdout_output_name) = &send_stdout_to { - // Convert logs to DataMessage - let array = message.as_str().into_arrow(); - - let array: ArrayData = array.into(); - let total_len = required_data_size(&array); - let mut sample: AVec> = AVec::__from_elem(128, 0, total_len); - - let type_info = copy_array_into_sample(&mut sample, &array); - - let metadata = Metadata::new(uhlc.new_timestamp(), type_info); - let output_id = OutputId( - node_id.clone(), - DataId::from(stdout_output_name.to_string()), - ); - let event = DoraEvent::Logs { - dataflow_id, - output_id, - metadata, - message: DataMessage::Vec(sample), - } - .into(); - let event = Timestamped { - inner: event, - timestamp: uhlc.new_timestamp(), - }; - let _ = daemon_tx_log.send(event).await; - } + }; - let _ = file - .write_all(message.as_bytes()) - .await - .map_err(|err| error!("Could not log {message} to file due to {err}")); - let formatted = message.lines().fold(String::default(), |mut output, line| { - output.push_str(line); - output - }); - if std::env::var("DORA_QUIET").is_err() { - cloned_logger - .log(LogMessage { - daemon_id: Some(daemon_id.clone()), - dataflow_id, - level: LogLevel::Info, - node_id: Some(node_id.clone()), - target: Some("stdout".into()), - message: formatted, - file: None, - line: None, - module_path: None, - }) - .await; - } - // Make sure that all data has been synced to disk. - let _ = file - .sync_all() - .await - .map_err(|err| error!("Could not sync logs to file due to {err}")); - } - let _ = log_finish_tx - .send(()) - .map_err(|_| error!("Could not inform that log file thread finished")); - }); - Ok(running_node) + Ok(Some(cmd)) } diff --git a/binaries/runtime/Cargo.toml b/binaries/runtime/Cargo.toml index 270ba264..73e6c615 100644 --- a/binaries/runtime/Cargo.toml +++ b/binaries/runtime/Cargo.toml @@ -21,7 +21,7 @@ eyre = "0.6.8" futures = "0.3.21" futures-concurrency = "7.1.0" libloading = "0.7.3" -serde_yaml = "0.8.23" +serde_yaml = { workspace = true } tokio = { version = "1.24.2", features = ["full"] } tokio-stream = "0.1.8" # pyo3-abi3 flag allow simpler linking. 
See: https://pyo3.rs/v0.13.2/building_and_distribution.html diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index d4df4b4b..7a42142a 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -43,7 +43,8 @@ pub fn main() -> eyre::Result<()> { .wrap_err("failed to set up tracing subscriber")?; } - let dataflow_descriptor = config.dataflow_descriptor.clone(); + let dataflow_descriptor = serde_yaml::from_value(config.dataflow_descriptor.clone()) + .context("failed to parse dataflow descriptor")?; let operator_definition = if operators.is_empty() { bail!("no operators"); diff --git a/examples/c++-arrow-dataflow/run.rs b/examples/c++-arrow-dataflow/run.rs index 399a73b1..a77c4d78 100644 --- a/examples/c++-arrow-dataflow/run.rs +++ b/examples/c++-arrow-dataflow/run.rs @@ -112,6 +112,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") @@ -136,6 +137,7 @@ async fn build_cxx_node( clang.arg("-l").arg("m"); clang.arg("-l").arg("rt"); clang.arg("-l").arg("dl"); + clang.arg("-l").arg("z"); clang.arg("-pthread"); } #[cfg(target_os = "windows")] diff --git a/examples/c++-dataflow/.gitignore b/examples/c++-dataflow/.gitignore index 5761abcf..d255f72c 100644 --- a/examples/c++-dataflow/.gitignore +++ b/examples/c++-dataflow/.gitignore @@ -1 +1,2 @@ *.o +/build diff --git a/examples/c++-dataflow/run.rs b/examples/c++-dataflow/run.rs index 6f966e19..dd88900a 100644 --- a/examples/c++-dataflow/run.rs +++ b/examples/c++-dataflow/run.rs @@ -133,6 +133,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") @@ -157,6 +158,7 @@ async fn build_cxx_node( clang.arg("-l").arg("m"); clang.arg("-l").arg("rt"); clang.arg("-l").arg("dl"); + clang.arg("-l").arg("z"); clang.arg("-pthread"); } #[cfg(target_os = "windows")] diff --git a/examples/c++-ros2-dataflow/.gitignore b/examples/c++-ros2-dataflow/.gitignore index 5761abcf..d255f72c 100644 --- a/examples/c++-ros2-dataflow/.gitignore +++ b/examples/c++-ros2-dataflow/.gitignore @@ -1 +1,2 @@ *.o +/build diff --git a/examples/c++-ros2-dataflow/run.rs b/examples/c++-ros2-dataflow/run.rs index 918158c2..4af3cd31 100644 --- a/examples/c++-ros2-dataflow/run.rs +++ b/examples/c++-ros2-dataflow/run.rs @@ -90,6 +90,7 @@ async fn build_cxx_node( clang.arg("-l").arg("m"); clang.arg("-l").arg("rt"); clang.arg("-l").arg("dl"); + clang.arg("-l").arg("z"); clang.arg("-pthread"); } #[cfg(target_os = "windows")] @@ -154,6 +155,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/c-dataflow/run.rs b/examples/c-dataflow/run.rs index ad484edf..e71d802b 100644 --- a/examples/c-dataflow/run.rs +++ b/examples/c-dataflow/run.rs @@ -44,6 +44,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") @@ -63,6 +64,7 @@ async fn build_c_node(root: &Path, name: &str, out_name: 
&str) -> eyre::Result<( clang.arg("-l").arg("m"); clang.arg("-l").arg("rt"); clang.arg("-l").arg("dl"); + clang.arg("-l").arg("z"); clang.arg("-pthread"); } #[cfg(target_os = "windows")] @@ -93,6 +95,8 @@ async fn build_c_node(root: &Path, name: &str, out_name: &str) -> eyre::Result<( clang.arg("-lsynchronization"); clang.arg("-luser32"); clang.arg("-lwinspool"); + clang.arg("-lwinhttp"); + clang.arg("-lrpcrt4"); clang.arg("-Wl,-nodefaultlib:libcmt"); clang.arg("-D_DLL"); @@ -107,6 +111,7 @@ async fn build_c_node(root: &Path, name: &str, out_name: &str) -> eyre::Result<( clang.arg("-l").arg("pthread"); clang.arg("-l").arg("c"); clang.arg("-l").arg("m"); + clang.arg("-l").arg("z"); } clang.arg("-L").arg(root.join("target").join("debug")); clang @@ -161,6 +166,8 @@ async fn build_c_operator(root: &Path) -> eyre::Result<()> { link.arg("-lsynchronization"); link.arg("-luser32"); link.arg("-lwinspool"); + link.arg("-lwinhttp"); + link.arg("-lrpcrt4"); link.arg("-Wl,-nodefaultlib:libcmt"); link.arg("-D_DLL"); diff --git a/examples/camera/run.rs b/examples/camera/run.rs index 9c475c26..94988261 100644 --- a/examples/camera/run.rs +++ b/examples/camera/run.rs @@ -43,6 +43,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -51,6 +52,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/examples/cmake-dataflow/run.rs b/examples/cmake-dataflow/run.rs index 30e3c9d1..b4530f26 100644 --- a/examples/cmake-dataflow/run.rs +++ b/examples/cmake-dataflow/run.rs @@ -61,6 +61,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/multiple-daemons/run.rs b/examples/multiple-daemons/run.rs index 64410a8a..cb558af3 100644 --- a/examples/multiple-daemons/run.rs +++ b/examples/multiple-daemons/run.rs @@ -1,3 +1,4 @@ +use dora_cli::session::DataflowSession; use dora_coordinator::{ControlEvent, Event}; use dora_core::{ descriptor::{read_as_descriptor, DescriptorExt}, @@ -8,7 +9,7 @@ use dora_message::{ common::DaemonId, coordinator_to_cli::{ControlRequestReply, DataflowIdAndName}, }; -use dora_tracing::set_up_tracing; +use dora_tracing::TracingBuilder; use eyre::{bail, Context}; use std::{ @@ -29,7 +30,9 @@ use uuid::Uuid; #[tokio::main] async fn main() -> eyre::Result<()> { - set_up_tracing("multiple-daemon-runner").wrap_err("failed to set up tracing subscriber")?; + TracingBuilder::new("multiple-daemon-runner") + .with_stdout("debug") + .build()?; let root = Path::new(env!("CARGO_MANIFEST_DIR")); std::env::set_current_dir(root.join(file!()).parent().unwrap()) @@ -47,12 +50,15 @@ async fn main() -> eyre::Result<()> { IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), DORA_COORDINATOR_PORT_CONTROL_DEFAULT, ); - let (_coordinator_port, coordinator) = dora_coordinator::start( + let (coordinator_port, coordinator) = dora_coordinator::start( coordinator_bind, coordinator_control_bind, 
ReceiverStream::new(coordinator_events_rx), ) .await?; + + tracing::info!("coordinator running on {coordinator_port}"); + let coordinator_addr = Ipv4Addr::LOCALHOST; let daemon_a = run_daemon(coordinator_addr.to_string(), "A"); let daemon_b = run_daemon(coordinator_addr.to_string(), "B"); @@ -135,12 +141,17 @@ async fn start_dataflow( .check(&working_dir) .wrap_err("could not validate yaml")?; + let dataflow_session = + DataflowSession::read_session(dataflow).context("failed to read DataflowSession")?; + let (reply_sender, reply) = oneshot::channel(); coordinator_events_tx .send(Event::Control(ControlEvent::IncomingRequest { request: ControlRequest::Start { + build_id: dataflow_session.build_id, + session_id: dataflow_session.session_id, dataflow: dataflow_descriptor, - local_working_dir: working_dir, + local_working_dir: Some(working_dir), name: None, uv: false, }, @@ -149,7 +160,21 @@ async fn start_dataflow( .await?; let result = reply.await??; let uuid = match result { - ControlRequestReply::DataflowStarted { uuid } => uuid, + ControlRequestReply::DataflowStartTriggered { uuid } => uuid, + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), + }; + + let (reply_sender, reply) = oneshot::channel(); + coordinator_events_tx + .send(Event::Control(ControlEvent::IncomingRequest { + request: ControlRequest::WaitForSpawn { dataflow_id: uuid }, + reply_sender, + })) + .await?; + let result = reply.await??; + let uuid = match result { + ControlRequestReply::DataflowSpawned { uuid } => uuid, ControlRequestReply::Error(err) => bail!("{err}"), other => bail!("unexpected start dataflow reply: {other:?}"), }; @@ -215,6 +240,7 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow); if !cmd.status().await?.success() { bail!("failed to build dataflow"); @@ -227,6 +253,7 @@ async fn run_daemon(coordinator: String, machine_id: &str) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--machine-id") diff --git a/examples/python-dataflow/run.rs b/examples/python-dataflow/run.rs index 23b254e2..de96795d 100644 --- a/examples/python-dataflow/run.rs +++ b/examples/python-dataflow/run.rs @@ -44,6 +44,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -52,6 +53,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/examples/python-multi-env/run.rs b/examples/python-multi-env/run.rs index 23b254e2..de96795d 100644 --- a/examples/python-multi-env/run.rs +++ b/examples/python-multi-env/run.rs @@ -44,6 +44,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); 
cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -52,6 +53,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/examples/python-operator-dataflow/run.rs b/examples/python-operator-dataflow/run.rs index 9c475c26..94988261 100644 --- a/examples/python-operator-dataflow/run.rs +++ b/examples/python-operator-dataflow/run.rs @@ -43,6 +43,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -51,6 +52,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/examples/python-ros2-dataflow/run.rs b/examples/python-ros2-dataflow/run.rs index 23b254e2..de96795d 100644 --- a/examples/python-ros2-dataflow/run.rs +++ b/examples/python-ros2-dataflow/run.rs @@ -44,6 +44,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -52,6 +53,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/examples/rerun-viewer/run.rs b/examples/rerun-viewer/run.rs index 4785ba9b..243db076 100644 --- a/examples/rerun-viewer/run.rs +++ b/examples/rerun-viewer/run.rs @@ -43,6 +43,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -51,6 +52,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/examples/rust-dataflow-git/.gitignore b/examples/rust-dataflow-git/.gitignore new file mode 100644 index 00000000..1ac9aaff --- /dev/null +++ b/examples/rust-dataflow-git/.gitignore @@ -0,0 +1,4 @@ +/build +/git + +/dataflow.dora-session.yaml diff --git a/examples/rust-dataflow-git/README.md b/examples/rust-dataflow-git/README.md new file mode 100644 index 00000000..f4d2f3de --- /dev/null +++ b/examples/rust-dataflow-git/README.md @@ -0,0 +1,7 @@ +# Git-based Rust example + +To get started: + 
+```bash
+cargo run --example rust-dataflow-git
+```
diff --git a/examples/rust-dataflow-git/dataflow.yml b/examples/rust-dataflow-git/dataflow.yml
new file mode 100644
index 00000000..cf06ede2
--- /dev/null
+++ b/examples/rust-dataflow-git/dataflow.yml
@@ -0,0 +1,29 @@
+nodes:
+  - id: rust-node
+    git: https://github.com/dora-rs/dora.git
+    rev: 64ab0d7c # pinned commit, update this when changing the message crate
+    build: cargo build -p rust-dataflow-example-node
+    path: target/debug/rust-dataflow-example-node
+    inputs:
+      tick: dora/timer/millis/10
+    outputs:
+      - random
+
+  - id: rust-status-node
+    git: https://github.com/dora-rs/dora.git
+    rev: 64ab0d7c # pinned commit, update this when changing the message crate
+    build: cargo build -p rust-dataflow-example-status-node
+    path: target/debug/rust-dataflow-example-status-node
+    inputs:
+      tick: dora/timer/millis/100
+      random: rust-node/random
+    outputs:
+      - status
+
+  - id: rust-sink
+    git: https://github.com/dora-rs/dora.git
+    rev: 64ab0d7c # pinned commit, update this when changing the message crate
+    build: cargo build -p rust-dataflow-example-sink
+    path: target/debug/rust-dataflow-example-sink
+    inputs:
+      message: rust-status-node/status
diff --git a/examples/rust-dataflow-git/run.rs b/examples/rust-dataflow-git/run.rs
new file mode 100644
index 00000000..855eb85b
--- /dev/null
+++ b/examples/rust-dataflow-git/run.rs
@@ -0,0 +1,53 @@
+use dora_tracing::set_up_tracing;
+use eyre::{bail, Context};
+use std::path::Path;
+
+#[tokio::main]
+async fn main() -> eyre::Result<()> {
+    set_up_tracing("rust-dataflow-runner").wrap_err("failed to set up tracing subscriber")?;
+
+    let root = Path::new(env!("CARGO_MANIFEST_DIR"));
+    std::env::set_current_dir(root.join(file!()).parent().unwrap())
+        .wrap_err("failed to set working dir")?;
+
+    let args: Vec<String> = std::env::args().collect();
+    let dataflow = if args.len() > 1 {
+        Path::new(&args[1])
+    } else {
+        Path::new("dataflow.yml")
+    };
+    build_dataflow(dataflow).await?;
+
+    run_dataflow(dataflow).await?;
+
+    Ok(())
+}
+
+async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> {
+    let cargo = std::env::var("CARGO").unwrap();
+    let mut cmd = tokio::process::Command::new(&cargo);
+    cmd.arg("run");
+    cmd.arg("--package").arg("dora-cli");
+    cmd.arg("--release");
+    cmd.arg("--").arg("build").arg(dataflow);
+    if !cmd.status().await?.success() {
+        bail!("failed to build dataflow");
+    };
+    Ok(())
+}
+
+async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> {
+    let cargo = std::env::var("CARGO").unwrap();
+    let mut cmd = tokio::process::Command::new(&cargo);
+    cmd.arg("run");
+    cmd.arg("--package").arg("dora-cli");
+    cmd.arg("--release");
+    cmd.arg("--")
+        .arg("daemon")
+        .arg("--run-dataflow")
+        .arg(dataflow);
+    if !cmd.status().await?.success() {
+        bail!("failed to run dataflow");
+    };
+    Ok(())
+}
diff --git a/examples/rust-dataflow-url/.gitignore b/examples/rust-dataflow-url/.gitignore
new file mode 100644
index 00000000..796b96d1
--- /dev/null
+++ b/examples/rust-dataflow-url/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/examples/rust-dataflow-url/run.rs b/examples/rust-dataflow-url/run.rs
index 6f511970..158e8ed9 100644
--- a/examples/rust-dataflow-url/run.rs
+++ b/examples/rust-dataflow-url/run.rs
@@ -23,6 +23,7 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> {
     let mut cmd = tokio::process::Command::new(&cargo);
     cmd.arg("run");
     cmd.arg("--package").arg("dora-cli");
+    cmd.arg("--release");
     cmd.arg("--").arg("build").arg(dataflow);
     if !cmd.status().await?.success() {
        bail!("failed
to build dataflow"); @@ -35,6 +36,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/rust-dataflow/run.rs b/examples/rust-dataflow/run.rs index 213b65a0..855eb85b 100644 --- a/examples/rust-dataflow/run.rs +++ b/examples/rust-dataflow/run.rs @@ -16,7 +16,6 @@ async fn main() -> eyre::Result<()> { } else { Path::new("dataflow.yml") }; - build_dataflow(dataflow).await?; run_dataflow(dataflow).await?; @@ -29,6 +28,7 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow); if !cmd.status().await?.success() { bail!("failed to build dataflow"); @@ -41,6 +41,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/rust-ros2-dataflow/run.rs b/examples/rust-ros2-dataflow/run.rs index a14dce48..f81a25d5 100644 --- a/examples/rust-ros2-dataflow/run.rs +++ b/examples/rust-ros2-dataflow/run.rs @@ -23,6 +23,7 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow); if !cmd.status().await?.success() { bail!("failed to build dataflow"); @@ -35,6 +36,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/vlm/run.rs b/examples/vlm/run.rs index 1ec38c80..742c3818 100644 --- a/examples/vlm/run.rs +++ b/examples/vlm/run.rs @@ -43,6 +43,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -51,6 +52,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/libraries/core/Cargo.toml b/libraries/core/Cargo.toml index 8ad7952a..41f2e112 100644 --- a/libraries/core/Cargo.toml +++ b/libraries/core/Cargo.toml @@ -9,17 +9,25 @@ repository.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[features] +build = ["dep:git2", "dep:url"] + [dependencies] dora-message = { workspace = true } eyre = "0.6.8" serde = { version = "1.0.136", features = ["derive"] } -serde_yaml = "0.9.11" +serde_yaml = { workspace = true } once_cell = "1.13.0" which = "5.0.0" uuid = { version = "1.7", features = ["serde", "v7"] } tracing = "0.1" serde-with-expand-env = "1.1.0" -tokio = { version = "1.24.1", features = ["fs", "process", "sync"] } +tokio = { version = 
"1.24.1", features = ["fs", "process", "sync", "rt"] } schemars = "0.8.19" serde_json = "1.0.117" log = { version = "0.4.21", features = ["serde"] } +dunce = "1.0.5" +itertools = "0.14" +url = { version = "2.5.4", optional = true } +git2 = { workspace = true, optional = true } +fs_extra = "1.3.0" diff --git a/libraries/core/src/build/build_command.rs b/libraries/core/src/build/build_command.rs new file mode 100644 index 00000000..ef31a6fd --- /dev/null +++ b/libraries/core/src/build/build_command.rs @@ -0,0 +1,83 @@ +use std::{ + collections::BTreeMap, + io::{BufRead, BufReader}, + path::Path, + process::{Command, Stdio}, +}; + +use dora_message::descriptor::EnvValue; +use eyre::{eyre, Context}; + +pub fn run_build_command( + build: &str, + working_dir: &Path, + uv: bool, + envs: &Option>, + stdout_tx: tokio::sync::mpsc::Sender>, +) -> eyre::Result<()> { + let lines = build.lines().collect::>(); + for build_line in lines { + let mut split = build_line.split_whitespace(); + + let program = split + .next() + .ok_or_else(|| eyre!("build command is empty"))?; + let mut cmd = if uv && (program == "pip" || program == "pip3") { + let mut cmd = Command::new("uv"); + cmd.arg("pip"); + cmd + } else { + Command::new(program) + }; + cmd.args(split); + + // Inject Environment Variables + if let Some(envs) = envs { + for (key, value) in envs { + let value = value.to_string(); + cmd.env(key, value); + } + } + + cmd.current_dir(dunce::simplified(working_dir)); + + cmd.stdin(Stdio::null()); + cmd.stdout(Stdio::piped()); + cmd.stderr(Stdio::piped()); + + cmd.env("CLICOLOR", "1"); + cmd.env("CLICOLOR_FORCE", "1"); + + let mut child = cmd + .spawn() + .wrap_err_with(|| format!("failed to spawn `{}`", build))?; + + let child_stdout = BufReader::new(child.stdout.take().expect("failed to take stdout")); + let child_stderr = BufReader::new(child.stderr.take().expect("failed to take stderr")); + let stderr_tx = stdout_tx.clone(); + let stdout_tx = stdout_tx.clone(); + + std::thread::spawn(move || { + for line in child_stdout.lines() { + if stdout_tx.blocking_send(line).is_err() { + break; + } + } + }); + std::thread::spawn(move || { + for line in child_stderr.lines() { + if stderr_tx.blocking_send(line).is_err() { + break; + } + } + }); + + let exit_status = cmd + .status() + .wrap_err_with(|| format!("failed to run `{}`", build))?; + if !exit_status.success() { + return Err(eyre!("build command `{build_line}` returned {exit_status}")); + } + } + Ok(()) +} diff --git a/libraries/core/src/build/git.rs b/libraries/core/src/build/git.rs new file mode 100644 index 00000000..7e06f2e0 --- /dev/null +++ b/libraries/core/src/build/git.rs @@ -0,0 +1,374 @@ +use crate::build::{BuildLogger, PrevGitSource}; +use dora_message::{common::LogLevel, DataflowId, SessionId}; +use eyre::{bail, ContextCompat, WrapErr}; +use git2::FetchOptions; +use itertools::Itertools; +use std::{ + collections::{BTreeMap, BTreeSet}, + path::{Path, PathBuf}, +}; +use url::Url; + +#[derive(Default)] +pub struct GitManager { + /// Directories that are currently in use by running dataflows. + pub clones_in_use: BTreeMap>, + /// Builds that are prepared, but not done yet. + prepared_builds: BTreeMap, + reuse_for: BTreeMap, +} + +#[derive(Default)] +struct PreparedBuild { + /// Clone dirs that will be created during the build process. + /// + /// This allows subsequent nodes to reuse the dirs. 
+ planned_clone_dirs: BTreeSet, +} + +impl GitManager { + pub fn choose_clone_dir( + &mut self, + session_id: SessionId, + repo: String, + commit_hash: String, + prev_git: Option, + target_dir: &Path, + ) -> eyre::Result { + let repo_url = Url::parse(&repo).context("failed to parse git repository URL")?; + let clone_dir = Self::clone_dir_path(target_dir, &repo_url, &commit_hash)?; + + let prev_commit_hash = prev_git + .as_ref() + .filter(|p| p.git_source.repo == repo) + .map(|p| &p.git_source.commit_hash); + + if let Some(using) = self.clones_in_use.get(&clone_dir) { + if !using.is_empty() { + // The directory is currently in use by another dataflow. Rebuilding + // while a dataflow is running could lead to unintended behavior. + eyre::bail!( + "the build directory is still in use by the following \ + dataflows, please stop them before rebuilding: {}", + using.iter().join(", ") + ) + } + } + + let reuse = if self.clone_dir_ready(session_id, &clone_dir) { + // The directory already contains a checkout of the commit we're interested in. + // So we can simply reuse the directory without doing any additional git + // operations. + ReuseOptions::Reuse { + dir: clone_dir.clone(), + } + } else if let Some(previous_commit_hash) = prev_commit_hash { + // we might be able to update a previous clone + let prev_clone_dir = Self::clone_dir_path(target_dir, &repo_url, previous_commit_hash)?; + + if prev_clone_dir.exists() { + let still_needed = prev_git + .map(|g| g.still_needed_for_this_build) + .unwrap_or(false); + let used_by_others = self + .clones_in_use + .get(&prev_clone_dir) + .map(|ids| !ids.is_empty()) + .unwrap_or(false); + if still_needed || used_by_others { + // previous clone is still in use -> we cannot rename it, but we can copy it + ReuseOptions::CopyAndFetch { + from: prev_clone_dir, + target_dir: clone_dir.clone(), + commit_hash, + } + } else { + // there is an unused previous clone that is no longer needed -> rename it + ReuseOptions::RenameAndFetch { + from: prev_clone_dir, + target_dir: clone_dir.clone(), + commit_hash, + } + } + } else { + // no existing clone associated with previous build id + ReuseOptions::NewClone { + target_dir: clone_dir.clone(), + repo_url, + commit_hash, + } + } + } else { + // no previous build that we can reuse + ReuseOptions::NewClone { + target_dir: clone_dir.clone(), + repo_url, + commit_hash, + } + }; + self.register_ready_clone_dir(session_id, clone_dir); + + Ok(GitFolder { reuse }) + } + + pub fn in_use(&self, dir: &Path) -> bool { + self.clones_in_use + .get(dir) + .map(|ids| !ids.is_empty()) + .unwrap_or(false) + } + + pub fn clone_dir_ready(&self, session_id: SessionId, dir: &Path) -> bool { + self.prepared_builds + .get(&session_id) + .map(|p| p.planned_clone_dirs.contains(dir)) + .unwrap_or(false) + || dir.exists() + } + + pub fn register_ready_clone_dir(&mut self, session_id: SessionId, dir: PathBuf) -> bool { + self.prepared_builds + .entry(session_id) + .or_default() + .planned_clone_dirs + .insert(dir) + } + + fn clone_dir_path( + base_dir: &Path, + repo_url: &Url, + commit_hash: &String, + ) -> eyre::Result { + let mut path = base_dir.join(repo_url.host_str().context("git URL has no hostname")?); + path.extend(repo_url.path_segments().context("no path in git URL")?); + let path = path.join(commit_hash); + Ok(dunce::simplified(&path).to_owned()) + } + + pub fn clear_planned_builds(&mut self, session_id: SessionId) { + self.prepared_builds.remove(&session_id); + } +} + +pub struct GitFolder { + /// Specifies whether an existing repo should 
be reused. + reuse: ReuseOptions, +} + +impl GitFolder { + pub async fn prepare(self, logger: &mut impl BuildLogger) -> eyre::Result { + let GitFolder { reuse } = self; + + tracing::info!("reuse: {reuse:?}"); + let clone_dir = match reuse { + ReuseOptions::NewClone { + target_dir, + repo_url, + commit_hash, + } => { + logger + .log_message( + LogLevel::Info, + format!( + "cloning {repo_url}#{commit_hash} into {}", + target_dir.display() + ), + ) + .await; + let clone_target = target_dir.clone(); + let checkout_result = tokio::task::spawn_blocking(move || { + let repository = clone_into(repo_url.clone(), &clone_target) + .with_context(|| format!("failed to clone git repo from `{repo_url}`"))?; + checkout_tree(&repository, &commit_hash) + .with_context(|| format!("failed to checkout commit `{commit_hash}`")) + }) + .await + .unwrap(); + + match checkout_result { + Ok(()) => target_dir, + Err(err) => { + logger + .log_message(LogLevel::Error, format!("{err:?}")) + .await; + // remove erroneous clone again + if let Err(err) = std::fs::remove_dir_all(target_dir) { + logger + .log_message( + LogLevel::Error, + format!( + "failed to remove clone dir after clone/checkout error: {}", + err.kind() + ), + ) + .await; + } + bail!(err) + } + } + } + ReuseOptions::CopyAndFetch { + from, + target_dir, + commit_hash, + } => { + let from_clone = from.clone(); + let to = target_dir.clone(); + tokio::task::spawn_blocking(move || { + std::fs::create_dir_all(&to) + .context("failed to create directory for copying git repo")?; + fs_extra::dir::copy( + &from_clone, + &to, + &fs_extra::dir::CopyOptions::new().content_only(true), + ) + .with_context(|| { + format!( + "failed to copy repo clone from `{}` to `{}`", + from_clone.display(), + to.display() + ) + }) + }) + .await??; + + logger + .log_message( + LogLevel::Info, + format!("fetching changes after copying {}", from.display()), + ) + .await; + + let repository = fetch_changes(&target_dir, None).await?; + checkout_tree(&repository, &commit_hash)?; + target_dir + } + ReuseOptions::RenameAndFetch { + from, + target_dir, + commit_hash, + } => { + tokio::fs::rename(&from, &target_dir) + .await + .context("failed to rename repo clone")?; + + logger + .log_message( + LogLevel::Info, + format!("fetching changes after renaming {}", from.display()), + ) + .await; + + let repository = fetch_changes(&target_dir, None).await?; + checkout_tree(&repository, &commit_hash)?; + target_dir + } + ReuseOptions::Reuse { dir } => { + logger + .log_message( + LogLevel::Info, + format!("reusing up-to-date {}", dir.display()), + ) + .await; + dir + } + }; + Ok(clone_dir) + } +} + +#[derive(Debug)] +enum ReuseOptions { + /// Create a new clone of the repository. + NewClone { + target_dir: PathBuf, + repo_url: Url, + commit_hash: String, + }, + /// Reuse an existing up-to-date clone of the repository. + Reuse { dir: PathBuf }, + /// Copy an older clone of the repository and fetch changes, then reuse it. + CopyAndFetch { + from: PathBuf, + target_dir: PathBuf, + commit_hash: String, + }, + /// Rename an older clone of the repository and fetch changes, then reuse it. 
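The `ReuseOptions` variants correspond to the decision tree in `choose_clone_dir` above: reuse a checkout that is already prepared for this session, otherwise recycle the previous commit's clone (copying it while it is still needed, renaming it once it is free), and only fall back to a fresh clone when neither exists. A condensed sketch of that ordering with the bookkeeping reduced to booleans (illustrative only, not the actual `GitManager` API):

```rust
#[derive(Debug, PartialEq)]
enum Reuse {
    Reuse,
    CopyAndFetch,
    RenameAndFetch,
    NewClone,
}

/// Decide how to obtain a checkout for the requested commit.
fn choose(
    clone_dir_ready: bool,   // checkout for this commit already exists or is planned
    prev_clone_exists: bool, // clone of the previously built commit is on disk
    prev_still_in_use: bool, // previous clone is needed by this build or a running dataflow
) -> Reuse {
    if clone_dir_ready {
        Reuse::Reuse
    } else if prev_clone_exists {
        if prev_still_in_use {
            Reuse::CopyAndFetch
        } else {
            Reuse::RenameAndFetch
        }
    } else {
        Reuse::NewClone
    }
}

fn main() {
    assert_eq!(choose(false, true, false), Reuse::RenameAndFetch);
    assert_eq!(choose(false, true, true), Reuse::CopyAndFetch);
    assert_eq!(choose(false, false, false), Reuse::NewClone);
}
```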
+ RenameAndFetch { + from: PathBuf, + target_dir: PathBuf, + commit_hash: String, + }, +} + +fn clone_into(repo_addr: Url, clone_dir: &Path) -> eyre::Result { + if let Some(parent) = clone_dir.parent() { + std::fs::create_dir_all(parent) + .context("failed to create parent directory for git clone")?; + } + + let clone_dir = clone_dir.to_owned(); + + let mut builder = git2::build::RepoBuilder::new(); + let mut fetch_options = git2::FetchOptions::new(); + fetch_options.download_tags(git2::AutotagOption::All); + builder.fetch_options(fetch_options); + builder + .clone(repo_addr.as_str(), &clone_dir) + .context("failed to clone repo") +} + +async fn fetch_changes( + repo_dir: &Path, + refname: Option, +) -> Result { + let repo_dir = repo_dir.to_owned(); + let fetch_changes = tokio::task::spawn_blocking(move || { + let repository = git2::Repository::open(&repo_dir).context("failed to open git repo")?; + + { + let mut remote = repository + .find_remote("origin") + .context("failed to find remote `origin` in repo")?; + remote + .connect(git2::Direction::Fetch) + .context("failed to connect to remote")?; + let default_branch = remote + .default_branch() + .context("failed to get default branch for remote")?; + let fetch = match &refname { + Some(refname) => refname, + None => default_branch + .as_str() + .context("failed to read default branch as string")?, + }; + let mut fetch_options = FetchOptions::new(); + fetch_options.download_tags(git2::AutotagOption::All); + remote + .fetch(&[&fetch], Some(&mut fetch_options), None) + .context("failed to fetch from git repo")?; + } + Result::<_, eyre::Error>::Ok(repository) + }); + let repository = fetch_changes.await??; + Ok(repository) +} + +fn checkout_tree(repository: &git2::Repository, commit_hash: &str) -> eyre::Result<()> { + let (object, reference) = repository + .revparse_ext(commit_hash) + .context("failed to parse ref")?; + repository + .checkout_tree(&object, None) + .context("failed to checkout ref")?; + match reference { + Some(reference) => repository + .set_head(reference.name().context("failed to get reference_name")?) 
+ .context("failed to set head")?, + None => repository + .set_head_detached(object.id()) + .context("failed to set detached head")?, + } + + Ok(()) +} diff --git a/libraries/core/src/build/logger.rs b/libraries/core/src/build/logger.rs new file mode 100644 index 00000000..c382b1ac --- /dev/null +++ b/libraries/core/src/build/logger.rs @@ -0,0 +1,19 @@ +use std::future::Future; + +pub use dora_message::common::LogLevelOrStdout; + +pub trait BuildLogger: Send { + type Clone: BuildLogger + 'static; + + fn log_message( + &mut self, + level: impl Into + Send, + message: impl Into + Send, + ) -> impl Future + Send; + + fn log_stdout(&mut self, message: impl Into + Send) -> impl Future + Send { + self.log_message(LogLevelOrStdout::Stdout, message) + } + + fn try_clone(&self) -> impl Future> + Send; +} diff --git a/libraries/core/src/build/mod.rs b/libraries/core/src/build/mod.rs new file mode 100644 index 00000000..5e7193d5 --- /dev/null +++ b/libraries/core/src/build/mod.rs @@ -0,0 +1,148 @@ +pub use git::GitManager; +pub use logger::{BuildLogger, LogLevelOrStdout}; + +use url::Url; + +use std::{collections::BTreeMap, future::Future, path::PathBuf}; + +use crate::descriptor::ResolvedNode; +use dora_message::{ + common::{GitSource, LogLevel}, + descriptor::{CoreNodeKind, EnvValue}, + id::NodeId, + SessionId, +}; +use eyre::Context; + +use build_command::run_build_command; +use git::GitFolder; + +mod build_command; +mod git; +mod logger; + +#[derive(Clone)] +pub struct Builder { + pub session_id: SessionId, + pub base_working_dir: PathBuf, + pub uv: bool, +} + +impl Builder { + pub async fn build_node( + self, + node: ResolvedNode, + git: Option, + prev_git: Option, + mut logger: impl BuildLogger, + git_manager: &mut GitManager, + ) -> eyre::Result>> { + let prepared_git = if let Some(GitSource { repo, commit_hash }) = git { + let target_dir = self.base_working_dir.join("git"); + let git_folder = git_manager.choose_clone_dir( + self.session_id, + repo, + commit_hash, + prev_git, + &target_dir, + )?; + Some(git_folder) + } else { + None + }; + + let task = async move { self.build_node_inner(node, &mut logger, prepared_git).await }; + Ok(task) + } + + async fn build_node_inner( + self, + node: ResolvedNode, + logger: &mut impl BuildLogger, + git_folder: Option, + ) -> eyre::Result { + logger.log_message(LogLevel::Debug, "building node").await; + let node_working_dir = match &node.kind { + CoreNodeKind::Custom(n) => { + let node_working_dir = match git_folder { + Some(git_folder) => { + let clone_dir = git_folder.prepare(logger).await?; + tracing::warn!( + "using git clone directory as working dir: \ + this behavior is unstable and might change \ + (see https://github.com/dora-rs/dora/pull/901)" + ); + clone_dir + } + None => self.base_working_dir, + }; + + if let Some(build) = &n.build { + build_node(logger, &node.env, node_working_dir.clone(), build, self.uv).await?; + } + node_working_dir + } + CoreNodeKind::Runtime(n) => { + // run build commands + for operator in &n.operators { + if let Some(build) = &operator.config.build { + build_node( + logger, + &node.env, + self.base_working_dir.clone(), + build, + self.uv, + ) + .await?; + } + } + self.base_working_dir.clone() + } + }; + Ok(BuiltNode { node_working_dir }) + } +} + +async fn build_node( + logger: &mut impl BuildLogger, + node_env: &Option>, + working_dir: PathBuf, + build: &String, + uv: bool, +) -> eyre::Result<()> { + logger + .log_message(LogLevel::Info, format!("running build command: `{build}")) + .await; + let build = 
build.to_owned(); + let node_env = node_env.clone(); + let mut logger = logger.try_clone().await.context("failed to clone logger")?; + let (stdout_tx, mut stdout) = tokio::sync::mpsc::channel(10); + let task = tokio::task::spawn_blocking(move || { + run_build_command(&build, &working_dir, uv, &node_env, stdout_tx) + .context("build command failed") + }); + tokio::spawn(async move { + while let Some(line) = stdout.recv().await { + logger + .log_stdout(line.unwrap_or_else(|err| format!("io err: {}", err.kind()))) + .await; + } + }); + task.await??; + Ok(()) +} + +pub struct BuiltNode { + pub node_working_dir: PathBuf, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct BuildInfo { + pub node_working_dirs: BTreeMap, +} + +pub struct PrevGitSource { + pub git_source: GitSource, + /// `True` if any nodes of this dataflow still require the source for building. + pub still_needed_for_this_build: bool, +} diff --git a/libraries/core/src/descriptor/mod.rs b/libraries/core/src/descriptor/mod.rs index cb8860fa..c3cd910a 100644 --- a/libraries/core/src/descriptor/mod.rs +++ b/libraries/core/src/descriptor/mod.rs @@ -1,5 +1,6 @@ use dora_message::{ config::{Input, InputMapping, NodeRunConfig}, + descriptor::{GitRepoRev, NodeSource}, id::{DataId, NodeId, OperatorId}, }; use eyre::{bail, Context, OptionExt, Result}; @@ -53,7 +54,7 @@ impl DescriptorExt for Descriptor { // adjust input mappings let mut node_kind = node_kind_mut(&mut node)?; let input_mappings: Vec<_> = match &mut node_kind { - NodeKindMut::Standard { path: _, inputs } => inputs.values_mut().collect(), + NodeKindMut::Standard { inputs, .. } => inputs.values_mut().collect(), NodeKindMut::Runtime(node) => node .operators .iter_mut() @@ -76,8 +77,13 @@ impl DescriptorExt for Descriptor { // resolve nodes let kind = match node_kind { - NodeKindMut::Standard { path, inputs: _ } => CoreNodeKind::Custom(CustomNode { - source: path.clone(), + NodeKindMut::Standard { + path, + source, + inputs: _, + } => CoreNodeKind::Custom(CustomNode { + path: path.clone(), + source, args: node.args, build: node.build, send_stdout_as: node.send_stdout_as, @@ -149,14 +155,35 @@ pub async fn read_as_descriptor(path: &Path) -> eyre::Result { fn node_kind_mut(node: &mut Node) -> eyre::Result { match node.kind()? 
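The `build_node` helper above runs the blocking build command on a worker thread and forwards its output line by line through a tokio mpsc channel so that an async logger can pick it up. A self-contained sketch of that pattern (tokio with the `rt-multi-thread`, `macros`, and `sync` features), using `echo` as a stand-in for a real build command and `println!` in place of the build logger:

```rust
use std::io::{BufRead, BufReader};
use std::process::{Command, Stdio};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let (tx, mut rx) = tokio::sync::mpsc::channel::<std::io::Result<String>>(10);

    // Blocking side: spawn the command, stream its stdout into the channel, wait for exit.
    let build = tokio::task::spawn_blocking(move || -> std::io::Result<()> {
        let mut child = Command::new("echo")
            .arg("building...")
            .stdout(Stdio::piped())
            .spawn()?;
        let stdout = BufReader::new(child.stdout.take().expect("stdout was piped"));
        for line in stdout.lines() {
            if tx.blocking_send(line).is_err() {
                break; // receiver dropped, stop forwarding
            }
        }
        child.wait()?;
        Ok(())
    });

    // Async side: forward every line to the logger (stdout here).
    while let Some(line) = rx.recv().await {
        println!("[build] {}", line.unwrap_or_else(|err| format!("io error: {err}")));
    }

    build.await.expect("build task panicked")?;
    Ok(())
}
```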
{ - NodeKind::Standard(_) => node - .path - .as_ref() - .map(|path| NodeKindMut::Standard { - path, + NodeKind::Standard(_) => { + let source = match (&node.git, &node.branch, &node.tag, &node.rev) { + (None, None, None, None) => NodeSource::Local, + (Some(repo), branch, tag, rev) => { + let rev = match (branch, tag, rev) { + (None, None, None) => None, + (Some(branch), None, None) => Some(GitRepoRev::Branch(branch.clone())), + (None, Some(tag), None) => Some(GitRepoRev::Tag(tag.clone())), + (None, None, Some(rev)) => Some(GitRepoRev::Rev(rev.clone())), + other @ (_, _, _) => { + eyre::bail!("only one of `branch`, `tag`, and `rev` are allowed (got {other:?})") + } + }; + NodeSource::GitBranch { + repo: repo.clone(), + rev, + } + } + (None, _, _, _) => { + eyre::bail!("`git` source required when using branch, tag, or rev") + } + }; + + Ok(NodeKindMut::Standard { + path: node.path.as_ref().ok_or_eyre("missing `path` attribute")?, + source, inputs: &mut node.inputs, }) - .ok_or_eyre("no path"), + } NodeKind::Runtime(_) => node .operators .as_mut() @@ -249,6 +276,7 @@ pub enum NodeKind<'a> { enum NodeKindMut<'a> { Standard { path: &'a String, + source: NodeSource, inputs: &'a mut BTreeMap, }, /// Dora runtime node diff --git a/libraries/core/src/descriptor/validate.rs b/libraries/core/src/descriptor/validate.rs index c28bd451..f68979cd 100644 --- a/libraries/core/src/descriptor/validate.rs +++ b/libraries/core/src/descriptor/validate.rs @@ -28,23 +28,34 @@ pub fn check_dataflow( // check that nodes and operators exist for node in nodes.values() { match &node.kind { - descriptor::CoreNodeKind::Custom(custom) => match custom.source.as_str() { - SHELL_SOURCE => (), - DYNAMIC_SOURCE => (), - source => { - if source_is_url(source) { - info!("{source} is a URL."); // TODO: Implement url check. - } else if let Some(remote_daemon_id) = remote_daemon_id { - if let Some(machine) = &node.deploy.machine { - if remote_daemon_id.contains(&machine.as_str()) || coordinator_is_remote - { - info!("skipping path check for remote node `{}`", node.id); + descriptor::CoreNodeKind::Custom(custom) => match &custom.source { + dora_message::descriptor::NodeSource::Local => match custom.path.as_str() { + SHELL_SOURCE => (), + DYNAMIC_SOURCE => (), + source => { + if source_is_url(source) { + info!("{source} is a URL."); // TODO: Implement url check. + } else if let Some(remote_daemon_id) = remote_daemon_id { + if let Some(deploy) = &node.deploy { + if let Some(machine) = &deploy.machine { + if remote_daemon_id.contains(&machine.as_str()) + || coordinator_is_remote + { + info!("skipping path check for remote node `{}`", node.id); + } + } } - } - } else { - resolve_path(source, working_dir) - .wrap_err_with(|| format!("Could not find source path `{}`", source))?; - }; + } else if custom.build.is_some() { + info!("skipping path check for node with build command"); + } else { + resolve_path(source, working_dir).wrap_err_with(|| { + format!("Could not find source path `{}`", source) + })?; + }; + } + }, + dora_message::descriptor::NodeSource::GitBranch { repo, rev } => { + info!("skipping check for node with git source"); } }, descriptor::CoreNodeKind::Runtime(node) => { @@ -53,6 +64,8 @@ pub fn check_dataflow( OperatorSource::SharedLibrary(path) => { if source_is_url(path) { info!("{path} is a URL."); // TODO: Implement url check. 
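The `node_kind_mut` change above resolves the new `git`/`branch`/`tag`/`rev` descriptor fields into a `NodeSource`, rejecting descriptors that set more than one of the three selectors or that use a selector without a `git` URL. A small sketch of that resolution rule with stand-in types (`resolve_rev` is illustrative, not the function in the diff):

```rust
#[derive(Debug, PartialEq)]
enum GitRepoRev {
    Branch(String),
    Tag(String),
    Rev(String),
}

/// At most one of `branch`, `tag`, and `rev` may be set; none means
/// "default branch of the repository".
fn resolve_rev(
    branch: Option<&str>,
    tag: Option<&str>,
    rev: Option<&str>,
) -> Result<Option<GitRepoRev>, String> {
    match (branch, tag, rev) {
        (None, None, None) => Ok(None),
        (Some(b), None, None) => Ok(Some(GitRepoRev::Branch(b.to_owned()))),
        (None, Some(t), None) => Ok(Some(GitRepoRev::Tag(t.to_owned()))),
        (None, None, Some(r)) => Ok(Some(GitRepoRev::Rev(r.to_owned()))),
        _ => Err("only one of `branch`, `tag`, and `rev` is allowed".to_owned()),
    }
}

fn main() {
    assert_eq!(
        resolve_rev(None, None, Some("64ab0d7c")),
        Ok(Some(GitRepoRev::Rev("64ab0d7c".to_owned())))
    );
    assert!(resolve_rev(Some("main"), None, Some("64ab0d7c")).is_err());
}
```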
+ } else if operator_definition.config.build.is_some() { + info!("skipping path check for operator with build command"); } else { let path = adjust_shared_library_path(Path::new(&path))?; if !working_dir.join(&path).exists() { diff --git a/libraries/core/src/lib.rs b/libraries/core/src/lib.rs index c7e7cd6c..c45ec613 100644 --- a/libraries/core/src/lib.rs +++ b/libraries/core/src/lib.rs @@ -7,6 +7,8 @@ use std::{ pub use dora_message::{config, uhlc}; +#[cfg(feature = "build")] +pub mod build; pub mod descriptor; pub mod metadata; pub mod topics; diff --git a/libraries/message/Cargo.toml b/libraries/message/Cargo.toml index 7bb3f673..3bcc71fe 100644 --- a/libraries/message/Cargo.toml +++ b/libraries/message/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "dora-message" # versioned separately from the other dora crates -version = "0.4.4" +version = "0.5.0-alpha" edition.workspace = true documentation.workspace = true description.workspace = true @@ -23,7 +23,7 @@ aligned-vec = { version = "0.5.0", features = ["serde"] } semver = { version = "1.0.23", features = ["serde"] } schemars = "0.8.19" uhlc = "0.5.1" -serde_yaml = "0.9.11" +serde_yaml = { workspace = true } once_cell = "1.13.0" serde-with-expand-env = "1.1.0" bincode = "1.3.3" diff --git a/libraries/message/src/cli_to_coordinator.rs b/libraries/message/src/cli_to_coordinator.rs index 1b62fd58..bf3d3a03 100644 --- a/libraries/message/src/cli_to_coordinator.rs +++ b/libraries/message/src/cli_to_coordinator.rs @@ -1,22 +1,52 @@ -use std::{path::PathBuf, time::Duration}; +use std::{collections::BTreeMap, path::PathBuf, time::Duration}; use uuid::Uuid; use crate::{ + common::GitSource, descriptor::Descriptor, id::{NodeId, OperatorId}, + BuildId, SessionId, }; -#[derive(Debug, serde::Deserialize, serde::Serialize)] +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] pub enum ControlRequest { + Build { + session_id: SessionId, + dataflow: Descriptor, + git_sources: BTreeMap, + prev_git_sources: BTreeMap, + /// Allows overwriting the base working dir when CLI and daemon are + /// running on the same machine. + /// + /// Must not be used for multi-machine dataflows. + /// + /// Note that nodes with git sources still use a subdirectory of + /// the base working dir. + local_working_dir: Option, + uv: bool, + }, + WaitForBuild { + build_id: BuildId, + }, Start { + build_id: Option, + session_id: SessionId, dataflow: Descriptor, name: Option, - // TODO: remove this once we figure out deploying of node/operator - // binaries from CLI to coordinator/daemon - local_working_dir: PathBuf, + /// Allows overwriting the base working dir when CLI and daemon are + /// running on the same machine. + /// + /// Must not be used for multi-machine dataflows. + /// + /// Note that nodes with git sources still use a subdirectory of + /// the base working dir. 
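The new `ControlRequest` variants split deployment into two phases: a `Build`/`WaitForBuild` round trip followed by `Start`/`WaitForSpawn`, with `SessionId` and `BuildId` tying the two phases together. The mirror enum below only sketches the expected request order from the CLI's point of view; it is not the real `dora-message` type:

```rust
/// Illustrative mirror of the request sequence introduced in this diff.
#[derive(Debug)]
enum CliStep {
    Build,        // ControlRequest::Build { session_id, dataflow, git_sources, .. }
    WaitForBuild, // returns once the coordinator reports DataflowBuildFinished
    Start,        // ControlRequest::Start { build_id, session_id, .. }
    WaitForSpawn, // returns once the coordinator reports DataflowSpawned
}

fn deploy_steps() -> [CliStep; 4] {
    [
        CliStep::Build,
        CliStep::WaitForBuild,
        CliStep::Start,
        CliStep::WaitForSpawn,
    ]
}

fn main() {
    for step in deploy_steps() {
        println!("{step:?}");
    }
}
```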
+ local_working_dir: Option, uv: bool, }, + WaitForSpawn { + dataflow_id: Uuid, + }, Reload { dataflow_id: Uuid, node_id: NodeId, @@ -46,4 +76,9 @@ pub enum ControlRequest { dataflow_id: Uuid, level: log::LevelFilter, }, + BuildLogSubscribe { + build_id: BuildId, + level: log::LevelFilter, + }, + CliAndDefaultDaemonOnSameMachine, } diff --git a/libraries/message/src/common.rs b/libraries/message/src/common.rs index 93e2f8d9..d48f1308 100644 --- a/libraries/message/src/common.rs +++ b/libraries/message/src/common.rs @@ -5,17 +5,18 @@ use aligned_vec::{AVec, ConstAlign}; use eyre::Context as _; use uuid::Uuid; -use crate::{daemon_to_daemon::InterDaemonEvent, id::NodeId, DataflowId}; +use crate::{daemon_to_daemon::InterDaemonEvent, id::NodeId, BuildId, DataflowId}; pub use log::Level as LogLevel; #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] #[must_use] pub struct LogMessage { - pub dataflow_id: DataflowId, + pub build_id: Option, + pub dataflow_id: Option, pub node_id: Option, pub daemon_id: Option, - pub level: LogLevel, + pub level: LogLevelOrStdout, pub target: Option, pub module_path: Option, pub file: Option, @@ -23,6 +24,18 @@ pub struct LogMessage { pub message: String, } +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, PartialEq, Eq, PartialOrd, Ord)] +pub enum LogLevelOrStdout { + LogLevel(LogLevel), + Stdout, +} + +impl From for LogLevelOrStdout { + fn from(level: LogLevel) -> Self { + Self::LogLevel(level) + } +} + #[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] pub struct NodeError { pub timestamp: uhlc::Timestamp, @@ -32,6 +45,9 @@ pub struct NodeError { impl std::fmt::Display for NodeError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let NodeErrorCause::FailedToSpawn(err) = &self.cause { + return write!(f, "failed to spawn node: {err}"); + } match &self.exit_status { NodeExitStatus::Success => write!(f, ""), NodeExitStatus::IoError(err) => write!(f, "I/O error while reading exit status: {err}"), @@ -68,6 +84,7 @@ impl std::fmt::Display for NodeError { f, ". This error occurred because node `{caused_by_node}` exited before connecting to dora." 
)?, + NodeErrorCause::FailedToSpawn(_) => unreachable!(), // handled above NodeErrorCause::Other { stderr } if stderr.is_empty() => {} NodeErrorCause::Other { stderr } => { let line: &str = "---------------------------------------------------------------------------------\n"; @@ -88,6 +105,7 @@ pub enum NodeErrorCause { Cascading { caused_by_node: NodeId, }, + FailedToSpawn(String), Other { stderr: String, }, @@ -234,3 +252,9 @@ impl std::fmt::Display for DaemonId { write!(f, "{}", self.uuid) } } + +#[derive(Debug, serde::Deserialize, serde::Serialize, Clone, PartialEq, Eq)] +pub struct GitSource { + pub repo: String, + pub commit_hash: String, +} diff --git a/libraries/message/src/coordinator_to_cli.rs b/libraries/message/src/coordinator_to_cli.rs index c8f1d3c8..02243468 100644 --- a/libraries/message/src/coordinator_to_cli.rs +++ b/libraries/message/src/coordinator_to_cli.rs @@ -1,22 +1,46 @@ -use std::collections::{BTreeMap, BTreeSet}; +use std::{ + collections::{BTreeMap, BTreeSet}, + net::IpAddr, +}; use uuid::Uuid; pub use crate::common::{LogLevel, LogMessage, NodeError, NodeErrorCause, NodeExitStatus}; -use crate::{common::DaemonId, id::NodeId}; +use crate::{common::DaemonId, id::NodeId, BuildId}; #[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] pub enum ControlRequestReply { Error(String), CoordinatorStopped, - DataflowStarted { uuid: Uuid }, - DataflowReloaded { uuid: Uuid }, - DataflowStopped { uuid: Uuid, result: DataflowResult }, + DataflowBuildTriggered { + build_id: BuildId, + }, + DataflowBuildFinished { + build_id: BuildId, + result: Result<(), String>, + }, + DataflowStartTriggered { + uuid: Uuid, + }, + DataflowSpawned { + uuid: Uuid, + }, + DataflowReloaded { + uuid: Uuid, + }, + DataflowStopped { + uuid: Uuid, + result: DataflowResult, + }, DataflowList(DataflowList), DestroyOk, DaemonConnected(bool), ConnectedDaemons(BTreeSet), Logs(Vec), + CliAndDefaultDaemonIps { + default_daemon: Option, + cli: Option, + }, } #[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] diff --git a/libraries/message/src/coordinator_to_daemon.rs b/libraries/message/src/coordinator_to_daemon.rs index 482f0042..69da8923 100644 --- a/libraries/message/src/coordinator_to_daemon.rs +++ b/libraries/message/src/coordinator_to_daemon.rs @@ -5,10 +5,10 @@ use std::{ }; use crate::{ - common::DaemonId, + common::{DaemonId, GitSource}, descriptor::{Descriptor, ResolvedNode}, id::{NodeId, OperatorId}, - DataflowId, + BuildId, DataflowId, SessionId, }; pub use crate::common::Timestamped; @@ -33,6 +33,7 @@ impl RegisterResult { #[derive(Debug, serde::Deserialize, serde::Serialize)] pub enum DaemonCoordinatorEvent { + Build(BuildDataflowNodes), Spawn(SpawnDataflowNodes), AllNodesReady { dataflow_id: DataflowId, @@ -55,10 +56,38 @@ pub enum DaemonCoordinatorEvent { Heartbeat, } +#[derive(Debug, serde::Deserialize, serde::Serialize)] +pub struct BuildDataflowNodes { + pub build_id: BuildId, + pub session_id: SessionId, + /// Allows overwriting the base working dir when CLI and daemon are + /// running on the same machine. + /// + /// Must not be used for multi-machine dataflows. + /// + /// Note that nodes with git sources still use a subdirectory of + /// the base working dir. 
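`LogLevelOrStdout`, added to `common.rs` above, lets the same `LogMessage` carry either a structured log level or a raw stdout line from a build command; the `From<LogLevel>` impl keeps existing call sites working unchanged. A minimal re-creation of the idea (using the `log` crate's `Level`, as `dora-message` does; these are not the real types):

```rust
use log::Level;

#[derive(Debug, PartialEq)]
enum LogLevelOrStdout {
    LogLevel(Level),
    Stdout,
}

impl From<Level> for LogLevelOrStdout {
    fn from(level: Level) -> Self {
        Self::LogLevel(level)
    }
}

fn main() {
    // Build output lines are forwarded verbatim as `Stdout`,
    // while structured messages still carry a normal log level.
    let from_build = LogLevelOrStdout::Stdout;
    let from_logger: LogLevelOrStdout = Level::Info.into();
    assert_ne!(from_build, from_logger);
}
```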
+ pub local_working_dir: Option, + pub git_sources: BTreeMap, + pub prev_git_sources: BTreeMap, + pub dataflow_descriptor: Descriptor, + pub nodes_on_machine: BTreeSet, + pub uv: bool, +} + #[derive(Debug, serde::Deserialize, serde::Serialize)] pub struct SpawnDataflowNodes { + pub build_id: Option, + pub session_id: SessionId, pub dataflow_id: DataflowId, - pub working_dir: PathBuf, + /// Allows overwriting the base working dir when CLI and daemon are + /// running on the same machine. + /// + /// Must not be used for multi-machine dataflows. + /// + /// Note that nodes with git sources still use a subdirectory of + /// the base working dir. + pub local_working_dir: Option, pub nodes: BTreeMap, pub dataflow_descriptor: Descriptor, pub spawn_nodes: BTreeSet, diff --git a/libraries/message/src/daemon_to_coordinator.rs b/libraries/message/src/daemon_to_coordinator.rs index 22bd0e5f..ccafb0a5 100644 --- a/libraries/message/src/daemon_to_coordinator.rs +++ b/libraries/message/src/daemon_to_coordinator.rs @@ -3,7 +3,9 @@ use std::collections::BTreeMap; pub use crate::common::{ DataMessage, LogLevel, LogMessage, NodeError, NodeErrorCause, NodeExitStatus, Timestamped, }; -use crate::{common::DaemonId, current_crate_version, id::NodeId, versions_compatible, DataflowId}; +use crate::{ + common::DaemonId, current_crate_version, id::NodeId, versions_compatible, BuildId, DataflowId, +}; #[derive(Debug, serde::Serialize, serde::Deserialize)] pub enum CoordinatorRequest { @@ -46,6 +48,14 @@ impl DaemonRegisterRequest { #[derive(Debug, serde::Serialize, serde::Deserialize)] pub enum DaemonEvent { + BuildResult { + build_id: BuildId, + result: Result<(), String>, + }, + SpawnResult { + dataflow_id: DataflowId, + result: Result<(), String>, + }, AllNodesReady { dataflow_id: DataflowId, exited_before_subscribe: Vec, @@ -73,7 +83,8 @@ impl DataflowDaemonResult { #[derive(Debug, serde::Deserialize, serde::Serialize)] pub enum DaemonCoordinatorReply { - SpawnResult(Result<(), String>), + TriggerBuildResult(Result<(), String>), + TriggerSpawnResult(Result<(), String>), ReloadResult(Result<(), String>), StopResult(Result<(), String>), DestroyResult { diff --git a/libraries/message/src/daemon_to_node.rs b/libraries/message/src/daemon_to_node.rs index acc1630e..75c59bba 100644 --- a/libraries/message/src/daemon_to_node.rs +++ b/libraries/message/src/daemon_to_node.rs @@ -2,7 +2,7 @@ use std::{net::SocketAddr, path::PathBuf}; use crate::{ config::NodeRunConfig, - descriptor::{Descriptor, OperatorDefinition}, + descriptor::OperatorDefinition, id::{DataId, NodeId, OperatorId}, metadata::Metadata, DataflowId, @@ -23,7 +23,7 @@ pub struct NodeConfig { pub node_id: NodeId, pub run_config: NodeRunConfig, pub daemon_communication: DaemonCommunication, - pub dataflow_descriptor: Descriptor, + pub dataflow_descriptor: serde_yaml::Value, pub dynamic: bool, } diff --git a/libraries/message/src/descriptor.rs b/libraries/message/src/descriptor.rs index 2fe68760..f6a2ba9c 100644 --- a/libraries/message/src/descriptor.rs +++ b/libraries/message/src/descriptor.rs @@ -23,18 +23,19 @@ pub struct Descriptor { #[serde(default)] pub communication: CommunicationConfig, #[schemars(skip)] - #[serde(default, rename = "_unstable_deploy")] - pub deploy: Deploy, + #[serde(rename = "_unstable_deploy")] + pub deploy: Option, pub nodes: Vec, #[schemars(skip)] #[serde(default, rename = "_unstable_debug")] pub debug: Debug, } -#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Clone, Serialize, Deserialize, 
JsonSchema)] #[serde(deny_unknown_fields)] pub struct Deploy { pub machine: Option, + pub working_dir: Option, } #[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)] @@ -58,8 +59,8 @@ pub struct Node { /// Unstable machine deployment configuration #[schemars(skip)] - #[serde(default, rename = "_unstable_deploy")] - pub deploy: Deploy, + #[serde(rename = "_unstable_deploy")] + pub deploy: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub operators: Option, @@ -70,6 +71,15 @@ pub struct Node { #[serde(default, skip_serializing_if = "Option::is_none")] pub path: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub git: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub branch: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub tag: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub rev: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] pub args: Option, #[serde(default, skip_serializing_if = "Option::is_none")] @@ -90,7 +100,7 @@ pub struct ResolvedNode { pub env: Option>, #[serde(default)] - pub deploy: Deploy, + pub deploy: Option, #[serde(flatten)] pub kind: CoreNodeKind, @@ -216,7 +226,8 @@ pub struct CustomNode { /// args: some_node.py /// /// Source can match any executable in PATH. - pub source: String, + pub path: String, + pub source: NodeSource, /// Args for the executable. #[serde(default, skip_serializing_if = "Option::is_none")] pub args: Option, @@ -234,6 +245,28 @@ pub struct CustomNode { pub run_config: NodeRunConfig, } +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +pub enum NodeSource { + Local, + GitBranch { + repo: String, + rev: Option, + }, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +pub enum ResolvedNodeSource { + Local, + GitCommit { repo: String, commit_hash: String }, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +pub enum GitRepoRev { + Branch(String), + Tag(String), + Rev(String), +} + #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(untagged)] pub enum EnvValue { diff --git a/libraries/message/src/lib.rs b/libraries/message/src/lib.rs index 9d1870e0..e5e2e33f 100644 --- a/libraries/message/src/lib.rs +++ b/libraries/message/src/lib.rs @@ -24,9 +24,42 @@ pub mod coordinator_to_cli; pub use arrow_data; pub use arrow_schema; +use uuid::{Timestamp, Uuid}; pub type DataflowId = uuid::Uuid; +#[derive( + Debug, Clone, Copy, serde::Serialize, serde::Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, +)] +pub struct SessionId(uuid::Uuid); + +impl SessionId { + pub fn generate() -> Self { + Self(Uuid::new_v7(Timestamp::now(uuid::NoContext))) + } + + pub fn uuid(&self) -> uuid::Uuid { + self.0 + } +} + +#[derive( + Debug, Clone, Copy, serde::Serialize, serde::Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, +)] +pub struct BuildId(uuid::Uuid); + +impl BuildId { + pub fn generate() -> Self { + Self(Uuid::new_v7(Timestamp::now(uuid::NoContext))) + } +} + +impl std::fmt::Display for BuildId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "BuildId({})", self.0) + } +} + fn current_crate_version() -> semver::Version { let crate_version_raw = env!("CARGO_PKG_VERSION");
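`SessionId` and `BuildId` above are newtype wrappers around time-ordered UUIDv7 values, so identifiers generated later sort after earlier ones and remain unique across machines. A sketch of the same pattern, assuming the `uuid` crate with the `v7` feature enabled:

```rust
use uuid::{NoContext, Timestamp, Uuid};

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
struct BuildId(Uuid);

impl BuildId {
    fn generate() -> Self {
        // UUIDv7 embeds a millisecond timestamp in the most significant bits,
        // so freshly generated IDs sort roughly by creation time.
        Self(Uuid::new_v7(Timestamp::now(NoContext)))
    }
}

fn main() {
    let first = BuildId::generate();
    let second = BuildId::generate();
    println!("{first:?} then {second:?}");
}
```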