From e8d7bb02fb93774b3c7039ae8457202e423c8ad9 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 12 Mar 2025 15:37:50 +0100 Subject: [PATCH 001/101] Add support for git repo sources for nodes --- Cargo.lock | 301 +++++- binaries/cli/src/build.rs | 64 +- binaries/cli/src/lib.rs | 2 +- binaries/daemon/Cargo.toml | 2 + binaries/daemon/src/lib.rs | 40 +- binaries/daemon/src/spawn.rs | 1196 +++++++++++++-------- libraries/core/src/build.rs | 30 + libraries/core/src/descriptor/mod.rs | 46 +- libraries/core/src/descriptor/validate.rs | 39 +- libraries/core/src/lib.rs | 1 + libraries/message/src/descriptor.rs | 28 +- 11 files changed, 1168 insertions(+), 581 deletions(-) create mode 100644 libraries/core/src/build.rs diff --git a/Cargo.lock b/Cargo.lock index 41f76174..fc821f49 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2729,6 +2729,7 @@ dependencies = [ "flume 0.10.14", "futures", "futures-concurrency", + "git2", "serde_json", "serde_yaml 0.8.26", "shared-memory-server", @@ -2737,6 +2738,7 @@ dependencies = [ "tokio-stream", "tracing", "tracing-opentelemetry", + "url", "uuid", "which", "zenoh 1.2.1", @@ -4120,6 +4122,8 @@ dependencies = [ "libc", "libgit2-sys", "log", + "openssl-probe", + "openssl-sys", "url", ] @@ -4784,6 +4788,124 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.94", +] + [[package]] name = "ident_case" version = "1.0.1" @@ -4792,12 +4914,23 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.5.0" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" dependencies = [ - "unicode-bidi", - "unicode-normalization", + "icu_normalizer", + "icu_properties", ] [[package]] @@ -5371,7 +5504,9 @@ checksum = "ee4126d8b4ee5c9d9ea891dd875cfdc1e9d0950437179104b183d7d8a74d24e8" dependencies = [ "cc", "libc", + "libssh2-sys", "libz-sys", + "openssl-sys", "pkg-config", ] @@ -5422,6 +5557,20 @@ dependencies = [ "libc", ] +[[package]] +name = "libssh2-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dc8a030b787e2119a731f1951d6a773e2280c660f8ec4b0f5e1505a386e71ee" +dependencies = [ + "cc", + "libc", + "libz-sys", + "openssl-sys", + "pkg-config", + "vcpkg", +] + [[package]] name = "libz-sys" version = "1.1.18" @@ -5467,6 +5616,12 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "litemap" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23fb14cb19457329c82206317a5663005a4d404783dc74f4252769b0d5f42856" + [[package]] name = "litrs" version = "0.4.1" @@ -6586,6 +6741,18 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "openssl-sys" +version = "0.9.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb61ea9811cc39e3c2069f40b8b8e2e70d8569b361f879786cc7ed48b777cdd" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "opentelemetry" version = "0.18.0" @@ -11107,6 +11274,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" @@ -11595,6 +11768,16 @@ dependencies = [ "bytemuck", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinyvec" version = "1.6.1" @@ -12104,27 +12287,12 @@ dependencies = [ "version_check", ] -[[package]] -name = "unicode-bidi" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" - [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" -[[package]] -name = "unicode-normalization" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" -dependencies = [ - "tinyvec", -] - [[package]] name = "unicode-segmentation" version = "1.11.0" @@ -12226,9 +12394,9 @@ dependencies = [ [[package]] name = "url" -version = "2.5.2" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", @@ -12292,6 +12460,18 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "utf8parse" version = "0.2.2" @@ -13397,6 +13577,18 @@ dependencies = [ "syn 2.0.94", ] +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + [[package]] name = "ws2_32-sys" version = "0.2.1" @@ -13542,6 +13734,30 @@ dependencies = [ "linked-hash-map", ] +[[package]] +name = "yoke" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.94", + "synstructure", +] + [[package]] name = "zbus" version = "4.4.0" @@ -14589,12 +14805,55 @@ dependencies = [ "syn 2.0.94", ] +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.94", + "synstructure", +] + [[package]] name = "zeroize" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.94", +] + [[package]] name = "zstd" version = "0.13.0" diff --git a/binaries/cli/src/build.rs b/binaries/cli/src/build.rs index 9c18f74b..060a5171 100644 --- a/binaries/cli/src/build.rs +++ b/binaries/cli/src/build.rs @@ -1,9 +1,9 @@ use dora_core::{ + build::run_build_command, config::OperatorId, descriptor::{Descriptor, DescriptorExt, NodeExt, SINGLE_OPERATOR_DEFAULT_ID}, }; -use eyre::{eyre, Context}; -use std::{path::Path, process::Command}; +use eyre::Context; use crate::resolve_dataflow; @@ -22,70 +22,44 @@ pub fn build(dataflow: String, uv: bool) -> eyre::Result<()> { for node in descriptor.nodes { match node.kind()? { dora_core::descriptor::NodeKind::Standard(_) => { - run_build_command(node.build.as_deref(), working_dir, uv).with_context(|| { - format!("build command failed for standard node `{}`", node.id) - })? + if let Some(build) = &node.build { + run_build_command(build, working_dir, uv).with_context(|| { + format!("build command failed for standard node `{}`", node.id) + })? + } } dora_core::descriptor::NodeKind::Runtime(runtime_node) => { for operator in &runtime_node.operators { - run_build_command(operator.config.build.as_deref(), working_dir, uv) - .with_context(|| { + if let Some(build) = &operator.config.build { + run_build_command(build, working_dir, uv).with_context(|| { format!( "build command failed for operator `{}/{}`", node.id, operator.id ) })?; + } } } dora_core::descriptor::NodeKind::Custom(custom_node) => { - run_build_command(custom_node.build.as_deref(), working_dir, uv).with_context( - || format!("build command failed for custom node `{}`", node.id), - )? + if let Some(build) = &custom_node.build { + run_build_command(build, working_dir, uv).with_context(|| { + format!("build command failed for custom node `{}`", node.id) + })? + } } dora_core::descriptor::NodeKind::Operator(operator) => { - run_build_command(operator.config.build.as_deref(), working_dir, uv).with_context( - || { + if let Some(build) = &operator.config.build { + run_build_command(build, working_dir, uv).with_context(|| { format!( "build command failed for operator `{}/{}`", node.id, operator.id.as_ref().unwrap_or(&default_op_id) ) - }, - )? + })? 
+ } } } } Ok(()) } - -fn run_build_command(build: Option<&str>, working_dir: &Path, uv: bool) -> eyre::Result<()> { - if let Some(build) = build { - let lines = build.lines().collect::>(); - for build_line in lines { - let mut split = build_line.split_whitespace(); - - let program = split - .next() - .ok_or_else(|| eyre!("build command is empty"))?; - let mut cmd = if uv && (program == "pip" || program == "pip3") { - let mut cmd = Command::new("uv"); - cmd.arg("pip"); - cmd - } else { - Command::new(program) - }; - cmd.args(split); - cmd.current_dir(working_dir); - let exit_status = cmd - .status() - .wrap_err_with(|| format!("failed to run `{}`", build))?; - if !exit_status.success() { - return Err(eyre!("build command `{build_line}` returned {exit_status}")); - } - } - Ok(()) - } else { - Ok(()) - } -} diff --git a/binaries/cli/src/lib.rs b/binaries/cli/src/lib.rs index e169f667..5fb510fd 100644 --- a/binaries/cli/src/lib.rs +++ b/binaries/cli/src/lib.rs @@ -33,7 +33,7 @@ use tracing::level_filters::LevelFilter; use uuid::Uuid; mod attach; -mod build; +pub(crate) mod build; mod check; mod formatting; mod graph; diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index ca29d9b5..f996a758 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -44,3 +44,5 @@ sysinfo = "0.30.11" crossbeam = "0.8.4" crossbeam-skiplist = "0.1.3" zenoh = "1.1.1" +git2 = "0.18.0" +url = "2.5.4" diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index de531fd0..827e670a 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -21,6 +21,7 @@ use dora_message::{ }, daemon_to_daemon::InterDaemonEvent, daemon_to_node::{DaemonReply, NodeConfig, NodeDropEvent, NodeEvent}, + descriptor::NodeSource, metadata::{self, ArrowTypeInfo}, node_to_daemon::{DynamicNodeEvent, Timestamped}, DataflowId, @@ -34,6 +35,7 @@ use log::{DaemonLogger, DataflowLogger, Logger}; use pending::PendingNodes; use shared_memory_server::ShmemConf; use socket_stream_utils::socket_stream_send; +use spawn::Spawner; use std::{ collections::{BTreeMap, BTreeSet, HashMap}, net::SocketAddr, @@ -97,6 +99,8 @@ pub struct Daemon { remote_daemon_events_tx: Option>>>, logger: DaemonLogger, + + repos_in_use: BTreeMap>, } type DaemonRunResult = BTreeMap>>; @@ -286,6 +290,7 @@ impl Daemon { clock, zenoh_session, remote_daemon_events_tx, + repos_in_use: Default::default(), }; let dora_events = ReceiverStream::new(dora_events_rx); @@ -732,6 +737,16 @@ impl Daemon { } } + let mut spawner = Spawner { + dataflow_id, + working_dir, + daemon_tx: self.events_tx.clone(), + dataflow_descriptor, + clock: self.clock.clone(), + uv, + repos_in_use: &mut self.repos_in_use, + }; + // spawn nodes and set up subscriptions for node in nodes.into_values() { let mut logger = logger.reborrow().for_node(node.id.clone()); @@ -752,19 +767,10 @@ impl Daemon { logger .log(LogLevel::Info, Some("daemon".into()), "spawning") .await; - match spawn::spawn_node( - dataflow_id, - &working_dir, - node, - self.events_tx.clone(), - dataflow_descriptor.clone(), - self.clock.clone(), - node_stderr_most_recent, - uv, - &mut logger, - ) - .await - .wrap_err_with(|| format!("failed to spawn node `{node_id}`")) + match spawner + .spawn_node(node, node_stderr_most_recent, &mut logger) + .await + .wrap_err_with(|| format!("failed to spawn node `{node_id}`")) { Ok(running_node) => { dataflow.running_nodes.insert(node_id, running_node); @@ -1310,6 +1316,10 @@ impl Daemon { .clone(), }; + 
self.repos_in_use.values_mut().for_each(|dataflows| { + dataflows.remove(&dataflow_id); + }); + logger .log( LogLevel::Info, @@ -2176,7 +2186,9 @@ impl CoreNodeKindExt for CoreNodeKind { fn dynamic(&self) -> bool { match self { CoreNodeKind::Runtime(_n) => false, - CoreNodeKind::Custom(n) => n.source == DYNAMIC_SOURCE, + CoreNodeKind::Custom(n) => { + matches!(&n.source, NodeSource::Local) && n.path == DYNAMIC_SOURCE + } } } } diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index db7c7bbd..643ae989 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -7,6 +7,7 @@ use aligned_vec::{AVec, ConstAlign}; use crossbeam::queue::ArrayQueue; use dora_arrow_convert::IntoArrow; use dora_core::{ + build::run_build_command, config::DataId, descriptor::{ resolve_path, source_is_url, Descriptor, OperatorDefinition, OperatorSource, PythonSource, @@ -20,6 +21,7 @@ use dora_message::{ common::{LogLevel, LogMessage}, daemon_to_coordinator::{DataMessage, NodeExitStatus, Timestamped}, daemon_to_node::{NodeConfig, RuntimeConfig}, + descriptor::GitRepoRev, DataflowId, }; use dora_node_api::{ @@ -28,7 +30,9 @@ use dora_node_api::{ Metadata, }; use eyre::{ContextCompat, WrapErr}; +use git2::FetchOptions; use std::{ + collections::{BTreeMap, BTreeSet}, path::{Path, PathBuf}, process::Stdio, sync::Arc, @@ -39,533 +43,777 @@ use tokio::{ sync::{mpsc, oneshot}, }; use tracing::error; +use url::Url; + +pub struct Spawner<'a> { + pub dataflow_id: DataflowId, + pub working_dir: PathBuf, + pub daemon_tx: mpsc::Sender>, + pub dataflow_descriptor: Descriptor, + /// clock is required for generating timestamps when dropping messages early because queue is full + pub clock: Arc, + pub uv: bool, + pub repos_in_use: &'a mut BTreeMap>, +} -/// clock is required for generating timestamps when dropping messages early because queue is full -pub async fn spawn_node( - dataflow_id: DataflowId, - working_dir: &Path, - node: ResolvedNode, - daemon_tx: mpsc::Sender>, - dataflow_descriptor: Descriptor, - clock: Arc, - node_stderr_most_recent: Arc>, - uv: bool, - logger: &mut NodeLogger<'_>, -) -> eyre::Result { - let node_id = node.id.clone(); - logger - .log( - LogLevel::Debug, - Some("daemon::spawner".into()), - "spawning node", +impl Spawner<'_> { + pub async fn spawn_node( + &mut self, + node: ResolvedNode, + node_stderr_most_recent: Arc>, + logger: &mut NodeLogger<'_>, + ) -> eyre::Result { + let dataflow_id = self.dataflow_id; + let node_id = node.id.clone(); + logger + .log( + LogLevel::Debug, + Some("daemon::spawner".into()), + "spawning node", + ) + .await; + + let queue_sizes = node_inputs(&node) + .into_iter() + .map(|(k, v)| (k, v.queue_size.unwrap_or(10))) + .collect(); + let daemon_communication = spawn_listener_loop( + &dataflow_id, + &node_id, + &self.daemon_tx, + self.dataflow_descriptor.communication.local, + queue_sizes, + self.clock.clone(), ) - .await; + .await?; + let send_stdout_to = node + .send_stdout_as() + .context("Could not resolve `send_stdout_as` configuration")?; - let queue_sizes = node_inputs(&node) - .into_iter() - .map(|(k, v)| (k, v.queue_size.unwrap_or(10))) - .collect(); - let daemon_communication = spawn_listener_loop( - &dataflow_id, - &node_id, - &daemon_tx, - dataflow_descriptor.communication.local, - queue_sizes, - clock.clone(), - ) - .await?; - let send_stdout_to = node - .send_stdout_as() - .context("Could not resolve `send_stdout_as` configuration")?; - - let node_config = NodeConfig { - dataflow_id, - node_id: node_id.clone(), - 
run_config: node.kind.run_config(), - daemon_communication, - dataflow_descriptor, - dynamic: node.kind.dynamic(), - }; + let node_config = NodeConfig { + dataflow_id, + node_id: node_id.clone(), + run_config: node.kind.run_config(), + daemon_communication, + dataflow_descriptor: self.dataflow_descriptor.clone(), + dynamic: node.kind.dynamic(), + }; - let mut child = match node.kind { - dora_core::descriptor::CoreNodeKind::Custom(n) => { - let mut command = match n.source.as_str() { - DYNAMIC_SOURCE => { + let mut child = match node.kind { + dora_core::descriptor::CoreNodeKind::Custom(n) => { + let command = match &n.source { + dora_message::descriptor::NodeSource::Local => { + spawn_command_from_path(&self.working_dir, self.uv, logger, &n, true) + .await? + } + dora_message::descriptor::NodeSource::GitBranch { repo, rev } => { + self.spawn_git_node(&n, repo, rev, logger).await? + } + }; + let Some(mut command) = command else { return Ok(RunningNode { pid: None, node_config, }); - } - SHELL_SOURCE => { - if cfg!(target_os = "windows") { - let mut cmd = tokio::process::Command::new("cmd"); - cmd.args(["/C", &n.args.clone().unwrap_or_default()]); - cmd - } else { - let mut cmd = tokio::process::Command::new("sh"); - cmd.args(["-c", &n.args.clone().unwrap_or_default()]); - cmd - } - } - source => { - let resolved_path = if source_is_url(source) { - // try to download the shared library - let target_dir = Path::new("build"); - download_file(source, target_dir) - .await - .wrap_err("failed to download custom node")? - } else { - resolve_path(source, working_dir).wrap_err_with(|| { - format!("failed to resolve node source `{}`", source) - })? - }; + }; - // If extension is .py, use python to run the script - let mut cmd = match resolved_path.extension().map(|ext| ext.to_str()) { - Some(Some("py")) => { - let mut cmd = if uv { - let mut cmd = tokio::process::Command::new("uv"); - cmd.arg("run"); - cmd.arg("python"); - logger - .log( - LogLevel::Info, - Some("spawner".into()), - format!( - "spawning: uv run python -u {}", - resolved_path.display() - ), - ) - .await; - cmd - } else { - let python = get_python_path().wrap_err( - "Could not find python path when spawning custom node", - )?; - logger - .log( - LogLevel::Info, - Some("spawner".into()), - format!( - "spawning: {:?} -u {}", - &python, - resolved_path.display() - ), - ) - .await; - - tokio::process::Command::new(python) - }; - // Force python to always flush stdout/stderr buffer - cmd.arg("-u"); - cmd.arg(&resolved_path); - cmd - } - _ => { - logger - .log( - LogLevel::Info, - Some("spawner".into()), - format!("spawning: {}", resolved_path.display()), - ) - .await; - if uv { - let mut cmd = tokio::process::Command::new("uv"); - cmd.arg("run"); - cmd.arg(&resolved_path); - cmd - } else { - tokio::process::Command::new(&resolved_path) - } - } - }; + command.current_dir(&self.working_dir); + command.stdin(Stdio::null()); - if let Some(args) = &n.args { - cmd.args(args.split_ascii_whitespace()); + command.env( + "DORA_NODE_CONFIG", + serde_yaml::to_string(&node_config.clone()) + .wrap_err("failed to serialize node config")?, + ); + // Injecting the env variable defined in the `yaml` into + // the node runtime. 
+ if let Some(envs) = node.env { + for (key, value) in envs { + command.env(key, value.to_string()); } - cmd - } - }; - - command.current_dir(working_dir); - command.stdin(Stdio::null()); - - command.env( - "DORA_NODE_CONFIG", - serde_yaml::to_string(&node_config.clone()) - .wrap_err("failed to serialize node config")?, - ); - // Injecting the env variable defined in the `yaml` into - // the node runtime. - if let Some(envs) = node.env { - for (key, value) in envs { - command.env(key, value.to_string()); } - } - if let Some(envs) = n.envs { - // node has some inner env variables -> add them too - for (key, value) in envs { - command.env(key, value.to_string()); + if let Some(envs) = n.envs { + // node has some inner env variables -> add them too + for (key, value) in envs { + command.env(key, value.to_string()); + } } - } - // Set the process group to 0 to ensure that the spawned process does not exit immediately on CTRL-C - #[cfg(unix)] - command.process_group(0); - - command.env("PYTHONUNBUFFERED", "1"); - command - .stdin(Stdio::null()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn() - .wrap_err_with(move || { - format!( - "failed to run `{}` with args `{}`", - n.source, - n.args.as_deref().unwrap_or_default(), - ) - })? - } - dora_core::descriptor::CoreNodeKind::Runtime(n) => { - let python_operators: Vec<&OperatorDefinition> = n - .operators - .iter() - .filter(|x| matches!(x.config.source, OperatorSource::Python { .. })) - .collect(); - - let other_operators = n - .operators - .iter() - .any(|x| !matches!(x.config.source, OperatorSource::Python { .. })); - - let mut command = if !python_operators.is_empty() && !other_operators { - // Use python to spawn runtime if there is a python operator - - // TODO: Handle multi-operator runtime once sub-interpreter is supported - if python_operators.len() > 2 { - eyre::bail!( - "Runtime currently only support one Python Operator. + // Set the process group to 0 to ensure that the spawned process does not exit immediately on CTRL-C + #[cfg(unix)] + command.process_group(0); + + command.env("PYTHONUNBUFFERED", "1"); + command + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .wrap_err_with(move || { + format!( + "failed to run `{}` with args `{}`", + n.path, + n.args.as_deref().unwrap_or_default(), + ) + })? + } + dora_core::descriptor::CoreNodeKind::Runtime(n) => { + let python_operators: Vec<&OperatorDefinition> = n + .operators + .iter() + .filter(|x| matches!(x.config.source, OperatorSource::Python { .. })) + .collect(); + + let other_operators = n + .operators + .iter() + .any(|x| !matches!(x.config.source, OperatorSource::Python { .. })); + + let mut command = if !python_operators.is_empty() && !other_operators { + // Use python to spawn runtime if there is a python operator + + // TODO: Handle multi-operator runtime once sub-interpreter is supported + if python_operators.len() > 2 { + eyre::bail!( + "Runtime currently only support one Python Operator. This is because pyo4 sub-interpreter is not yet available. 
See: https://github.com/PyO4/pyo3/issues/576" - ); - } + ); + } - let python_operator = python_operators - .first() - .context("Runtime had no operators definition.")?; + let python_operator = python_operators + .first() + .context("Runtime had no operators definition.")?; - if let OperatorSource::Python(PythonSource { - source: _, - conda_env: Some(conda_env), - }) = &python_operator.config.source - { - let conda = which::which("conda").context( + if let OperatorSource::Python(PythonSource { + source: _, + conda_env: Some(conda_env), + }) = &python_operator.config.source + { + let conda = which::which("conda").context( "failed to find `conda`, yet a `conda_env` was defined. Make sure that `conda` is available.", )?; - let mut command = tokio::process::Command::new(conda); - command.args([ - "run", - "-n", - conda_env, - "python", - "-c", - format!("import dora; dora.start_runtime() # {}", node.id).as_str(), - ]); - command - } else { - let mut cmd = if uv { - let mut cmd = tokio::process::Command::new("uv"); - cmd.arg("run"); - cmd.arg("python"); - tracing::info!( + let mut command = tokio::process::Command::new(conda); + command.args([ + "run", + "-n", + conda_env, + "python", + "-c", + format!("import dora; dora.start_runtime() # {}", node.id).as_str(), + ]); + command + } else { + let mut cmd = if self.uv { + let mut cmd = tokio::process::Command::new("uv"); + cmd.arg("run"); + cmd.arg("python"); + tracing::info!( "spawning: uv run python -uc import dora; dora.start_runtime() # {}", node.id ); + cmd + } else { + let python = get_python_path() + .wrap_err("Could not find python path when spawning custom node")?; + tracing::info!( + "spawning: python -uc import dora; dora.start_runtime() # {}", + node.id + ); + + tokio::process::Command::new(python) + }; + // Force python to always flush stdout/stderr buffer + cmd.args([ + "-c", + format!("import dora; dora.start_runtime() # {}", node.id).as_str(), + ]); cmd - } else { - let python = get_python_path() - .wrap_err("Could not find python path when spawning custom node")?; - tracing::info!( - "spawning: python -uc import dora; dora.start_runtime() # {}", - node.id - ); - - tokio::process::Command::new(python) - }; - // Force python to always flush stdout/stderr buffer - cmd.args([ - "-c", - format!("import dora; dora.start_runtime() # {}", node.id).as_str(), - ]); + } + } else if python_operators.is_empty() && other_operators { + let mut cmd = tokio::process::Command::new( + std::env::current_exe() + .wrap_err("failed to get current executable path")?, + ); + cmd.arg("runtime"); cmd - } - } else if python_operators.is_empty() && other_operators { - let mut cmd = tokio::process::Command::new( - std::env::current_exe().wrap_err("failed to get current executable path")?, - ); - cmd.arg("runtime"); - cmd - } else { - eyre::bail!("Runtime can not mix Python Operator with other type of operator."); - }; - command.current_dir(working_dir); + } else { + eyre::bail!("Runtime can not mix Python Operator with other type of operator."); + }; + command.current_dir(&self.working_dir); - let runtime_config = RuntimeConfig { - node: node_config.clone(), - operators: n.operators, - }; - command.env( - "DORA_RUNTIME_CONFIG", - serde_yaml::to_string(&runtime_config) - .wrap_err("failed to serialize runtime config")?, - ); - // Injecting the env variable defined in the `yaml` into - // the node runtime. 
- if let Some(envs) = node.env { - for (key, value) in envs { - command.env(key, value.to_string()); + let runtime_config = RuntimeConfig { + node: node_config.clone(), + operators: n.operators, + }; + command.env( + "DORA_RUNTIME_CONFIG", + serde_yaml::to_string(&runtime_config) + .wrap_err("failed to serialize runtime config")?, + ); + // Injecting the env variable defined in the `yaml` into + // the node runtime. + if let Some(envs) = node.env { + for (key, value) in envs { + command.env(key, value.to_string()); + } } + // Set the process group to 0 to ensure that the spawned process does not exit immediately on CTRL-C + #[cfg(unix)] + command.process_group(0); + + command + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .wrap_err(format!( + "failed to run runtime {}/{}", + runtime_config.node.dataflow_id, runtime_config.node.node_id + ))? } - // Set the process group to 0 to ensure that the spawned process does not exit immediately on CTRL-C - #[cfg(unix)] - command.process_group(0); - - command - .stdin(Stdio::null()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn() - .wrap_err(format!( - "failed to run runtime {}/{}", - runtime_config.node.dataflow_id, runtime_config.node.node_id - ))? - } - }; + }; - let pid = crate::ProcessId::new(child.id().context( - "Could not get the pid for the just spawned node and indicate that there is an error", - )?); - logger - .log( - LogLevel::Debug, - Some("spawner".into()), - format!("spawned node with pid {pid:?}"), - ) - .await; + let pid = crate::ProcessId::new(child.id().context( + "Could not get the pid for the just spawned node and indicate that there is an error", + )?); + logger + .log( + LogLevel::Debug, + Some("spawner".into()), + format!("spawned node with pid {pid:?}"), + ) + .await; + + let dataflow_dir: PathBuf = self.working_dir.join("out").join(dataflow_id.to_string()); + if !dataflow_dir.exists() { + std::fs::create_dir_all(&dataflow_dir).context("could not create dataflow_dir")?; + } + let (tx, mut rx) = mpsc::channel(10); + let mut file = File::create(log::log_path(&self.working_dir, &dataflow_id, &node.id)) + .await + .expect("Failed to create log file"); + let mut child_stdout = + tokio::io::BufReader::new(child.stdout.take().expect("failed to take stdout")); + let running_node = RunningNode { + pid: Some(pid), + node_config, + }; + let stdout_tx = tx.clone(); + let node_id = node.id.clone(); + // Stdout listener stream + tokio::spawn(async move { + let mut buffer = String::new(); + let mut finished = false; + while !finished { + let mut raw = Vec::new(); + finished = match child_stdout + .read_until(b'\n', &mut raw) + .await + .wrap_err_with(|| { + format!("failed to read stdout line from spawned node {node_id}") + }) { + Ok(0) => true, + Ok(_) => false, + Err(err) => { + tracing::warn!("{err:?}"); + false + } + }; - let dataflow_dir: PathBuf = working_dir.join("out").join(dataflow_id.to_string()); - if !dataflow_dir.exists() { - std::fs::create_dir_all(&dataflow_dir).context("could not create dataflow_dir")?; - } - let (tx, mut rx) = mpsc::channel(10); - let mut file = File::create(log::log_path(working_dir, &dataflow_id, &node_id)) - .await - .expect("Failed to create log file"); - let mut child_stdout = - tokio::io::BufReader::new(child.stdout.take().expect("failed to take stdout")); - let running_node = RunningNode { - pid: Some(pid), - node_config, - }; - let stdout_tx = tx.clone(); - let node_id = node.id.clone(); - // Stdout listener stream - tokio::spawn(async move { 
- let mut buffer = String::new(); - let mut finished = false; - while !finished { - let mut raw = Vec::new(); - finished = match child_stdout - .read_until(b'\n', &mut raw) - .await - .wrap_err_with(|| format!("failed to read stdout line from spawned node {node_id}")) - { - Ok(0) => true, - Ok(_) => false, - Err(err) => { - tracing::warn!("{err:?}"); - false - } - }; + match String::from_utf8(raw) { + Ok(s) => buffer.push_str(&s), + Err(err) => { + let lossy = String::from_utf8_lossy(err.as_bytes()); + tracing::warn!( + "stdout not valid UTF-8 string (node {node_id}): {}: {lossy}", + err.utf8_error() + ); + buffer.push_str(&lossy) + } + }; - match String::from_utf8(raw) { - Ok(s) => buffer.push_str(&s), - Err(err) => { - let lossy = String::from_utf8_lossy(err.as_bytes()); - tracing::warn!( - "stdout not valid UTF-8 string (node {node_id}): {}: {lossy}", - err.utf8_error() - ); - buffer.push_str(&lossy) + if buffer.contains("TRACE") + || buffer.contains("INFO") + || buffer.contains("DEBUG") + || buffer.contains("WARN") + || buffer.contains("ERROR") + { + // tracing output, potentially multi-line -> keep reading following lines + // until double-newline + if !buffer.ends_with("\n\n") && !finished { + continue; + } } - }; - if buffer.contains("TRACE") - || buffer.contains("INFO") - || buffer.contains("DEBUG") - || buffer.contains("WARN") - || buffer.contains("ERROR") - { - // tracing output, potentially multi-line -> keep reading following lines - // until double-newline - if !buffer.ends_with("\n\n") && !finished { - continue; + // send the buffered lines + let lines = std::mem::take(&mut buffer); + let sent = stdout_tx.send(lines.clone()).await; + if sent.is_err() { + println!("Could not log: {lines}"); } } + }); + + let mut child_stderr = + tokio::io::BufReader::new(child.stderr.take().expect("failed to take stderr")); + + // Stderr listener stream + let stderr_tx = tx.clone(); + let node_id = node.id.clone(); + let uhlc = self.clock.clone(); + let daemon_tx_log = self.daemon_tx.clone(); + tokio::spawn(async move { + let mut buffer = String::new(); + let mut finished = false; + while !finished { + let mut raw = Vec::new(); + finished = match child_stderr + .read_until(b'\n', &mut raw) + .await + .wrap_err_with(|| { + format!("failed to read stderr line from spawned node {node_id}") + }) { + Ok(0) => true, + Ok(_) => false, + Err(err) => { + tracing::warn!("{err:?}"); + true + } + }; - // send the buffered lines - let lines = std::mem::take(&mut buffer); - let sent = stdout_tx.send(lines.clone()).await; - if sent.is_err() { - println!("Could not log: {lines}"); - } - } - }); + let new = match String::from_utf8(raw) { + Ok(s) => s, + Err(err) => { + let lossy = String::from_utf8_lossy(err.as_bytes()); + tracing::warn!( + "stderr not valid UTF-8 string (node {node_id}): {}: {lossy}", + err.utf8_error() + ); + lossy.into_owned() + } + }; + + buffer.push_str(&new); - let mut child_stderr = - tokio::io::BufReader::new(child.stderr.take().expect("failed to take stderr")); - - // Stderr listener stream - let stderr_tx = tx.clone(); - let node_id = node.id.clone(); - let uhlc = clock.clone(); - let daemon_tx_log = daemon_tx.clone(); - tokio::spawn(async move { - let mut buffer = String::new(); - let mut finished = false; - while !finished { - let mut raw = Vec::new(); - finished = match child_stderr - .read_until(b'\n', &mut raw) - .await - .wrap_err_with(|| format!("failed to read stderr line from spawned node {node_id}")) - { - Ok(0) => true, - Ok(_) => false, - Err(err) => { - 
tracing::warn!("{err:?}"); - true + node_stderr_most_recent.force_push(new); + + // send the buffered lines + let lines = std::mem::take(&mut buffer); + let sent = stderr_tx.send(lines.clone()).await; + if sent.is_err() { + println!("Could not log: {lines}"); } + } + }); + + let node_id = node.id.clone(); + let (log_finish_tx, log_finish_rx) = oneshot::channel(); + let clock = self.clock.clone(); + let daemon_tx = self.daemon_tx.clone(); + tokio::spawn(async move { + let exit_status = NodeExitStatus::from(child.wait().await); + let _ = log_finish_rx.await; + let event = DoraEvent::SpawnedNodeResult { + dataflow_id, + node_id, + exit_status, + } + .into(); + let event = Timestamped { + inner: event, + timestamp: clock.new_timestamp(), }; - - let new = match String::from_utf8(raw) { - Ok(s) => s, - Err(err) => { - let lossy = String::from_utf8_lossy(err.as_bytes()); - tracing::warn!( - "stderr not valid UTF-8 string (node {node_id}): {}: {lossy}", - err.utf8_error() + let _ = daemon_tx.send(event).await; + }); + + let node_id = node.id.clone(); + let daemon_id = logger.inner().inner().daemon_id().clone(); + let mut cloned_logger = logger + .inner() + .inner() + .inner() + .try_clone() + .await + .context("failed to clone logger")?; + // Log to file stream. + tokio::spawn(async move { + while let Some(message) = rx.recv().await { + // If log is an output, we're sending the logs to the dataflow + if let Some(stdout_output_name) = &send_stdout_to { + // Convert logs to DataMessage + let array = message.into_arrow(); + + let array: ArrayData = array.into(); + let total_len = required_data_size(&array); + let mut sample: AVec> = + AVec::__from_elem(128, 0, total_len); + + let type_info = copy_array_into_sample(&mut sample, &array); + + let metadata = Metadata::new(uhlc.new_timestamp(), type_info); + let output_id = OutputId( + node_id.clone(), + DataId::from(stdout_output_name.to_string()), ); - lossy.into_owned() + let event = DoraEvent::Logs { + dataflow_id, + output_id, + metadata, + message: DataMessage::Vec(sample), + } + .into(); + let event = Timestamped { + inner: event, + timestamp: uhlc.new_timestamp(), + }; + let _ = daemon_tx_log.send(event).await; + } + + let _ = file + .write_all(message.as_bytes()) + .await + .map_err(|err| error!("Could not log {message} to file due to {err}")); + let formatted = message.lines().fold(String::default(), |mut output, line| { + output.push_str(line); + output + }); + if std::env::var("DORA_QUIET").is_err() { + cloned_logger + .log(LogMessage { + daemon_id: Some(daemon_id.clone()), + dataflow_id, + level: LogLevel::Info, + node_id: Some(node_id.clone()), + target: Some("stdout".into()), + message: formatted, + file: None, + line: None, + module_path: None, + }) + .await; } + // Make sure that all data has been synced to disk. 
+ let _ = file + .sync_all() + .await + .map_err(|err| error!("Could not sync logs to file due to {err}")); + } + let _ = log_finish_tx + .send(()) + .map_err(|_| error!("Could not inform that log file thread finished")); + }); + Ok(running_node) + } + + async fn spawn_git_node( + &mut self, + node: &dora_core::descriptor::CustomNode, + repo_addr: &String, + rev: &Option, + logger: &mut NodeLogger<'_>, + ) -> Result, eyre::Error> { + let dataflow_id = self.dataflow_id; + let repo_url = Url::parse(repo_addr).context("failed to parse git repository URL")?; + let target_dir = self.working_dir.join("build"); + let rev_str = rev_str(rev); + let refname = rev.clone().map(|rev| match rev { + GitRepoRev::Branch(branch) => format!("refs/remotes/origin/{branch}"), + GitRepoRev::Tag(tag) => format!("refs/tags/{tag}"), + GitRepoRev::Rev(rev) => rev, + }); + let clone_dir_base = { + let base = { + let mut path = + target_dir.join(repo_url.host_str().context("git URL has no hostname")?); + + path.extend(repo_url.path_segments().context("no path in git URL")?); + path }; + match rev { + None => base, + Some(rev) => match rev { + GitRepoRev::Branch(branch) => base.join("branch").join(branch), + GitRepoRev::Tag(tag) => base.join("tag").join(tag), + GitRepoRev::Rev(rev) => base.join("rev").join(rev), + }, + } + }; + let clone_dir = if clone_dir_base.exists() { + let used_by_other_dataflow = self.used_by_other_dataflow(dataflow_id, &clone_dir_base); + if used_by_other_dataflow { + // don't reuse, choose new directory + // (TODO reuse if still up to date) + + let dir_name = clone_dir_base.file_name().unwrap().to_str().unwrap(); + let mut i = 1; + loop { + let new_path = clone_dir_base.with_file_name(format!("{dir_name}-{i}")); + if new_path.exists() && self.used_by_other_dataflow(dataflow_id, &new_path) { + i += 1; + } else { + break new_path; + } + } + } else { + clone_dir_base + } + } else { + clone_dir_base + }; + if clone_dir.exists() { + let empty = BTreeSet::new(); + let in_use = self.repos_in_use.get(&clone_dir).unwrap_or(&empty); + let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); + if used_by_other_dataflow { + // TODO allow if still up to date + eyre::bail!("clone_dir is already in use by other dataflow") + } else { + self.repos_in_use + .entry(clone_dir.clone()) + .or_default() + .insert(dataflow_id); + logger + .log( + LogLevel::Info, + None, + format!("reusing {repo_addr}{rev_str}"), + ) + .await; + let refname_cloned = refname.clone(); + let clone_dir = clone_dir.clone(); + let repository = fetch_changes(clone_dir, refname_cloned).await?; + checkout_tree(&repository, refname)?; + } + } else { + self.repos_in_use + .entry(clone_dir.clone()) + .or_default() + .insert(dataflow_id); + let repository = clone_into(repo_addr, rev, &clone_dir, logger).await?; + checkout_tree(&repository, refname)?; + }; + if let Some(build) = &node.build { + logger + .log( + LogLevel::Info, + None, + format!("running build command: `{build}"), + ) + .await; + let build = build.to_owned(); + let clone_dir = clone_dir.clone(); + let uv = self.uv; + let task = tokio::task::spawn_blocking(move || { + run_build_command(&build, &clone_dir, uv).context("build command failed") + }); + task.await??; + } + spawn_command_from_path(&clone_dir, self.uv, logger, node, true).await + } - buffer.push_str(&new); + fn used_by_other_dataflow( + &mut self, + dataflow_id: uuid::Uuid, + clone_dir_base: &PathBuf, + ) -> bool { + let empty = BTreeSet::new(); + let in_use = 
self.repos_in_use.get(clone_dir_base).unwrap_or(&empty); + let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); + used_by_other_dataflow + } +} - node_stderr_most_recent.force_push(new); +fn rev_str(rev: &Option) -> String { + match rev { + Some(GitRepoRev::Branch(branch)) => format!(" (branch {branch})"), + Some(GitRepoRev::Tag(tag)) => format!(" (tag {tag})"), + Some(GitRepoRev::Rev(rev)) => format!(" (rev {rev})"), + None => String::new(), + } +} - // send the buffered lines - let lines = std::mem::take(&mut buffer); - let sent = stderr_tx.send(lines.clone()).await; - if sent.is_err() { - println!("Could not log: {lines}"); - } +async fn clone_into( + repo_addr: &String, + rev: &Option, + clone_dir: &Path, + logger: &mut NodeLogger<'_>, +) -> eyre::Result { + let rev_str = rev_str(rev); + logger + .log( + LogLevel::Info, + None, + format!("cloning {repo_addr}{rev_str} into {}", clone_dir.display()), + ) + .await; + let rev: Option = rev.clone(); + let clone_into = clone_dir.to_owned(); + let repo_addr = repo_addr.clone(); + let task = tokio::task::spawn_blocking(move || { + let mut builder = git2::build::RepoBuilder::new(); + let mut fetch_options = git2::FetchOptions::new(); + fetch_options.download_tags(git2::AutotagOption::All); + builder.fetch_options(fetch_options); + if let Some(GitRepoRev::Branch(branch)) = &rev { + builder.branch(branch); } + builder + .clone(&repo_addr, &clone_into) + .context("failed to clone repo") }); + let repo = task.await??; + Ok(repo) +} - let node_id = node.id.clone(); - let (log_finish_tx, log_finish_rx) = oneshot::channel(); - tokio::spawn(async move { - let exit_status = NodeExitStatus::from(child.wait().await); - let _ = log_finish_rx.await; - let event = DoraEvent::SpawnedNodeResult { - dataflow_id, - node_id, - exit_status, +async fn fetch_changes( + repo_dir: PathBuf, + refname: Option, +) -> Result { + let fetch_changes = tokio::task::spawn_blocking(move || { + let repository = git2::Repository::open(&repo_dir).context("failed to open git repo")?; + + { + let mut remote = repository + .find_remote("origin") + .context("failed to find remote `origin` in repo")?; + remote + .connect(git2::Direction::Fetch) + .context("failed to connect to remote")?; + let default_branch = remote + .default_branch() + .context("failed to get default branch for remote")?; + let fetch = match &refname { + Some(refname) => refname, + None => default_branch + .as_str() + .context("failed to read default branch as string")?, + }; + let mut fetch_options = FetchOptions::new(); + fetch_options.download_tags(git2::AutotagOption::All); + remote + .fetch(&[&fetch], Some(&mut fetch_options), None) + .context("failed to fetch from git repo")?; } - .into(); - let event = Timestamped { - inner: event, - timestamp: clock.new_timestamp(), - }; - let _ = daemon_tx.send(event).await; + Result::<_, eyre::Error>::Ok(repository) }); + let repository = fetch_changes.await??; + Ok(repository) +} - let node_id = node.id.clone(); - let daemon_id = logger.inner().inner().daemon_id().clone(); - let mut cloned_logger = logger - .inner() - .inner() - .inner() - .try_clone() - .await - .context("failed to clone logger")?; - // Log to file stream. 
- tokio::spawn(async move { - while let Some(message) = rx.recv().await { - // If log is an output, we're sending the logs to the dataflow - if let Some(stdout_output_name) = &send_stdout_to { - // Convert logs to DataMessage - let array = message.into_arrow(); - - let array: ArrayData = array.into(); - let total_len = required_data_size(&array); - let mut sample: AVec> = AVec::__from_elem(128, 0, total_len); - - let type_info = copy_array_into_sample(&mut sample, &array); - - let metadata = Metadata::new(uhlc.new_timestamp(), type_info); - let output_id = OutputId( - node_id.clone(), - DataId::from(stdout_output_name.to_string()), - ); - let event = DoraEvent::Logs { - dataflow_id, - output_id, - metadata, - message: DataMessage::Vec(sample), - } - .into(); - let event = Timestamped { - inner: event, - timestamp: uhlc.new_timestamp(), - }; - let _ = daemon_tx_log.send(event).await; +fn checkout_tree(repository: &git2::Repository, refname: Option) -> eyre::Result<()> { + if let Some(refname) = refname { + let (object, reference) = repository + .revparse_ext(&refname) + .context("failed to parse ref")?; + repository + .checkout_tree(&object, None) + .context("failed to checkout ref")?; + match reference { + Some(reference) => repository + .set_head(reference.name().context("failed to get reference_name")?) + .context("failed to set head")?, + None => repository + .set_head_detached(object.id()) + .context("failed to set detached head")?, + } + } + Ok(()) +} + +async fn spawn_command_from_path( + working_dir: &Path, + uv: bool, + logger: &mut NodeLogger<'_>, + node: &dora_core::descriptor::CustomNode, + permit_url: bool, +) -> eyre::Result> { + let cmd = match node.path.as_str() { + DYNAMIC_SOURCE => return Ok(None), + SHELL_SOURCE => { + if cfg!(target_os = "windows") { + let mut cmd = tokio::process::Command::new("cmd"); + cmd.args(["/C", &node.args.clone().unwrap_or_default()]); + cmd + } else { + let mut cmd = tokio::process::Command::new("sh"); + cmd.args(["-c", &node.args.clone().unwrap_or_default()]); + cmd } + } + source => { + let resolved_path = if source_is_url(source) { + if !permit_url { + eyre::bail!("URL paths are not supported in this case"); + } + // try to download the shared library + let target_dir = Path::new("build"); + download_file(source, target_dir) + .await + .wrap_err("failed to download custom node")? + } else { + resolve_path(source, working_dir) + .wrap_err_with(|| format!("failed to resolve node source `{}`", source))? 
+ }; - let _ = file - .write_all(message.as_bytes()) - .await - .map_err(|err| error!("Could not log {message} to file due to {err}")); - let formatted = message.lines().fold(String::default(), |mut output, line| { - output.push_str(line); - output - }); - if std::env::var("DORA_QUIET").is_err() { - cloned_logger - .log(LogMessage { - daemon_id: Some(daemon_id.clone()), - dataflow_id, - level: LogLevel::Info, - node_id: Some(node_id.clone()), - target: Some("stdout".into()), - message: formatted, - file: None, - line: None, - module_path: None, - }) - .await; + // If extension is .py, use python to run the script + let mut cmd = match resolved_path.extension().map(|ext| ext.to_str()) { + Some(Some("py")) => { + let mut cmd = if uv { + let mut cmd = tokio::process::Command::new("uv"); + cmd.arg("run"); + cmd.arg("python"); + logger + .log( + LogLevel::Info, + Some("spawner".into()), + format!("spawning: uv run python -u {}", resolved_path.display()), + ) + .await; + cmd + } else { + let python = get_python_path() + .wrap_err("Could not find python path when spawning custom node")?; + logger + .log( + LogLevel::Info, + Some("spawner".into()), + format!("spawning: {:?} -u {}", &python, resolved_path.display()), + ) + .await; + + tokio::process::Command::new(python) + }; + // Force python to always flush stdout/stderr buffer + cmd.arg("-u"); + cmd.arg(&resolved_path); + cmd + } + _ => { + logger + .log( + LogLevel::Info, + Some("spawner".into()), + format!("spawning: {}", resolved_path.display()), + ) + .await; + if uv { + let mut cmd = tokio::process::Command::new("uv"); + cmd.arg("run"); + cmd.arg(&resolved_path); + cmd + } else { + tokio::process::Command::new(&resolved_path) + } + } + }; + + if let Some(args) = &node.args { + cmd.args(args.split_ascii_whitespace()); } - // Make sure that all data has been synced to disk. 
- let _ = file - .sync_all() - .await - .map_err(|err| error!("Could not sync logs to file due to {err}")); + cmd } - let _ = log_finish_tx - .send(()) - .map_err(|_| error!("Could not inform that log file thread finished")); - }); - Ok(running_node) + }; + + Ok(Some(cmd)) } diff --git a/libraries/core/src/build.rs b/libraries/core/src/build.rs new file mode 100644 index 00000000..e5bd3bcd --- /dev/null +++ b/libraries/core/src/build.rs @@ -0,0 +1,30 @@ +use std::{path::Path, process::Command}; + +use eyre::{eyre, Context}; + +pub fn run_build_command(build: &str, working_dir: &Path, uv: bool) -> eyre::Result<()> { + let lines = build.lines().collect::>(); + for build_line in lines { + let mut split = build_line.split_whitespace(); + + let program = split + .next() + .ok_or_else(|| eyre!("build command is empty"))?; + let mut cmd = if uv && (program == "pip" || program == "pip3") { + let mut cmd = Command::new("uv"); + cmd.arg("pip"); + cmd + } else { + Command::new(program) + }; + cmd.args(split); + cmd.current_dir(working_dir); + let exit_status = cmd + .status() + .wrap_err_with(|| format!("failed to run `{}`", build))?; + if !exit_status.success() { + return Err(eyre!("build command `{build_line}` returned {exit_status}")); + } + } + Ok(()) +} diff --git a/libraries/core/src/descriptor/mod.rs b/libraries/core/src/descriptor/mod.rs index cb8860fa..c3cd910a 100644 --- a/libraries/core/src/descriptor/mod.rs +++ b/libraries/core/src/descriptor/mod.rs @@ -1,5 +1,6 @@ use dora_message::{ config::{Input, InputMapping, NodeRunConfig}, + descriptor::{GitRepoRev, NodeSource}, id::{DataId, NodeId, OperatorId}, }; use eyre::{bail, Context, OptionExt, Result}; @@ -53,7 +54,7 @@ impl DescriptorExt for Descriptor { // adjust input mappings let mut node_kind = node_kind_mut(&mut node)?; let input_mappings: Vec<_> = match &mut node_kind { - NodeKindMut::Standard { path: _, inputs } => inputs.values_mut().collect(), + NodeKindMut::Standard { inputs, .. } => inputs.values_mut().collect(), NodeKindMut::Runtime(node) => node .operators .iter_mut() @@ -76,8 +77,13 @@ impl DescriptorExt for Descriptor { // resolve nodes let kind = match node_kind { - NodeKindMut::Standard { path, inputs: _ } => CoreNodeKind::Custom(CustomNode { - source: path.clone(), + NodeKindMut::Standard { + path, + source, + inputs: _, + } => CoreNodeKind::Custom(CustomNode { + path: path.clone(), + source, args: node.args, build: node.build, send_stdout_as: node.send_stdout_as, @@ -149,14 +155,35 @@ pub async fn read_as_descriptor(path: &Path) -> eyre::Result { fn node_kind_mut(node: &mut Node) -> eyre::Result { match node.kind()? 
{ - NodeKind::Standard(_) => node - .path - .as_ref() - .map(|path| NodeKindMut::Standard { - path, + NodeKind::Standard(_) => { + let source = match (&node.git, &node.branch, &node.tag, &node.rev) { + (None, None, None, None) => NodeSource::Local, + (Some(repo), branch, tag, rev) => { + let rev = match (branch, tag, rev) { + (None, None, None) => None, + (Some(branch), None, None) => Some(GitRepoRev::Branch(branch.clone())), + (None, Some(tag), None) => Some(GitRepoRev::Tag(tag.clone())), + (None, None, Some(rev)) => Some(GitRepoRev::Rev(rev.clone())), + other @ (_, _, _) => { + eyre::bail!("only one of `branch`, `tag`, and `rev` are allowed (got {other:?})") + } + }; + NodeSource::GitBranch { + repo: repo.clone(), + rev, + } + } + (None, _, _, _) => { + eyre::bail!("`git` source required when using branch, tag, or rev") + } + }; + + Ok(NodeKindMut::Standard { + path: node.path.as_ref().ok_or_eyre("missing `path` attribute")?, + source, inputs: &mut node.inputs, }) - .ok_or_eyre("no path"), + } NodeKind::Runtime(_) => node .operators .as_mut() @@ -249,6 +276,7 @@ pub enum NodeKind<'a> { enum NodeKindMut<'a> { Standard { path: &'a String, + source: NodeSource, inputs: &'a mut BTreeMap, }, /// Dora runtime node diff --git a/libraries/core/src/descriptor/validate.rs b/libraries/core/src/descriptor/validate.rs index c28bd451..526fedff 100644 --- a/libraries/core/src/descriptor/validate.rs +++ b/libraries/core/src/descriptor/validate.rs @@ -28,23 +28,30 @@ pub fn check_dataflow( // check that nodes and operators exist for node in nodes.values() { match &node.kind { - descriptor::CoreNodeKind::Custom(custom) => match custom.source.as_str() { - SHELL_SOURCE => (), - DYNAMIC_SOURCE => (), - source => { - if source_is_url(source) { - info!("{source} is a URL."); // TODO: Implement url check. - } else if let Some(remote_daemon_id) = remote_daemon_id { - if let Some(machine) = &node.deploy.machine { - if remote_daemon_id.contains(&machine.as_str()) || coordinator_is_remote - { - info!("skipping path check for remote node `{}`", node.id); + descriptor::CoreNodeKind::Custom(custom) => match &custom.source { + dora_message::descriptor::NodeSource::Local => match custom.path.as_str() { + SHELL_SOURCE => (), + DYNAMIC_SOURCE => (), + source => { + if source_is_url(source) { + info!("{source} is a URL."); // TODO: Implement url check. 
+ } else if let Some(remote_daemon_id) = remote_daemon_id { + if let Some(machine) = &node.deploy.machine { + if remote_daemon_id.contains(&machine.as_str()) + || coordinator_is_remote + { + info!("skipping path check for remote node `{}`", node.id); + } } - } - } else { - resolve_path(source, working_dir) - .wrap_err_with(|| format!("Could not find source path `{}`", source))?; - }; + } else { + resolve_path(source, working_dir).wrap_err_with(|| { + format!("Could not find source path `{}`", source) + })?; + }; + } + }, + dora_message::descriptor::NodeSource::GitBranch { repo, rev } => { + // TODO: implement git repo check } }, descriptor::CoreNodeKind::Runtime(node) => { diff --git a/libraries/core/src/lib.rs b/libraries/core/src/lib.rs index c7e7cd6c..90f2c564 100644 --- a/libraries/core/src/lib.rs +++ b/libraries/core/src/lib.rs @@ -7,6 +7,7 @@ use std::{ pub use dora_message::{config, uhlc}; +pub mod build; pub mod descriptor; pub mod metadata; pub mod topics; diff --git a/libraries/message/src/descriptor.rs b/libraries/message/src/descriptor.rs index 2fe68760..02f660d4 100644 --- a/libraries/message/src/descriptor.rs +++ b/libraries/message/src/descriptor.rs @@ -70,6 +70,15 @@ pub struct Node { #[serde(default, skip_serializing_if = "Option::is_none")] pub path: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub git: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub branch: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub tag: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub rev: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] pub args: Option, #[serde(default, skip_serializing_if = "Option::is_none")] @@ -216,7 +225,8 @@ pub struct CustomNode { /// args: some_node.py /// /// Source can match any executable in PATH. - pub source: String, + pub path: String, + pub source: NodeSource, /// Args for the executable. #[serde(default, skip_serializing_if = "Option::is_none")] pub args: Option, @@ -234,6 +244,22 @@ pub struct CustomNode { pub run_config: NodeRunConfig, } +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +pub enum NodeSource { + Local, + GitBranch { + repo: String, + rev: Option, + }, +} + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +pub enum GitRepoRev { + Branch(String), + Tag(String), + Rev(String), +} + #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(untagged)] pub enum EnvValue { From e31b2a34898ec71f1196519759f06e2ac274c815 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 2 Apr 2025 15:40:08 +0200 Subject: [PATCH 002/101] Remove needless cloning of node `env` field for building --- binaries/cli/src/build.rs | 30 ++++++++++++++---------------- binaries/daemon/src/spawn.rs | 2 +- libraries/core/src/build.rs | 4 ++-- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/binaries/cli/src/build.rs b/binaries/cli/src/build.rs index 3a1bb281..7783615b 100644 --- a/binaries/cli/src/build.rs +++ b/binaries/cli/src/build.rs @@ -23,15 +23,15 @@ pub fn build(dataflow: String, uv: bool) -> eyre::Result<()> { match node.kind()? { dora_core::descriptor::NodeKind::Standard(_) => { if let Some(build) = &node.build { - run_build_command(build, working_dir, uv, node.env.clone()).with_context( - || format!("build command failed for standard node `{}`", node.id), - )? 
+ run_build_command(build, working_dir, uv, &node.env).with_context(|| { + format!("build command failed for standard node `{}`", node.id) + })? } } dora_core::descriptor::NodeKind::Runtime(runtime_node) => { for operator in &runtime_node.operators { if let Some(build) = &operator.config.build { - run_build_command(build, working_dir, uv, node.env.clone()).with_context( + run_build_command(build, working_dir, uv, &node.env).with_context( || { format!( "build command failed for operator `{}/{}`", @@ -44,22 +44,20 @@ pub fn build(dataflow: String, uv: bool) -> eyre::Result<()> { } dora_core::descriptor::NodeKind::Custom(custom_node) => { if let Some(build) = &custom_node.build { - run_build_command(build, working_dir, uv, node.env.clone()).with_context( - || format!("build command failed for custom node `{}`", node.id), - )? + run_build_command(build, working_dir, uv, &node.env).with_context(|| { + format!("build command failed for custom node `{}`", node.id) + })? } } dora_core::descriptor::NodeKind::Operator(operator) => { if let Some(build) = &operator.config.build { - run_build_command(build, working_dir, uv, node.env.clone()).with_context( - || { - format!( - "build command failed for operator `{}/{}`", - node.id, - operator.id.as_ref().unwrap_or(&default_op_id) - ) - }, - )? + run_build_command(build, working_dir, uv, &node.env).with_context(|| { + format!( + "build command failed for operator `{}/{}`", + node.id, + operator.id.as_ref().unwrap_or(&default_op_id) + ) + })? } } } diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 0c87449e..ec640952 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -608,7 +608,7 @@ impl Spawner<'_> { let uv = self.uv; let node_env = node_env.clone(); let task = tokio::task::spawn_blocking(move || { - run_build_command(&build, &clone_dir, uv, node_env).context("build command failed") + run_build_command(&build, &clone_dir, uv, &node_env).context("build command failed") }); task.await??; } diff --git a/libraries/core/src/build.rs b/libraries/core/src/build.rs index 4f8fba3d..2d900536 100644 --- a/libraries/core/src/build.rs +++ b/libraries/core/src/build.rs @@ -7,7 +7,7 @@ pub fn run_build_command( build: &str, working_dir: &Path, uv: bool, - envs: Option>, + envs: &Option>, ) -> eyre::Result<()> { let lines = build.lines().collect::>(); for build_line in lines { @@ -26,7 +26,7 @@ pub fn run_build_command( cmd.args(split); // Inject Environment Variables - if let Some(envs) = envs.clone() { + if let Some(envs) = envs { for (key, value) in envs { let value = value.to_string(); cmd.env(key, value); From 1eae50adb3b52e95ffd6f025cf7bccf6535bcdad Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 2 Apr 2025 15:57:14 +0200 Subject: [PATCH 003/101] Add a `rust-dataflow-git` example and run it on CI --- .github/workflows/ci.yml | 3 ++ Cargo.toml | 4 ++ examples/rust-dataflow-git/README.md | 7 ++++ examples/rust-dataflow-git/dataflow.yml | 29 ++++++++++++++ examples/rust-dataflow-git/run.rs | 52 +++++++++++++++++++++++++ 5 files changed, 95 insertions(+) create mode 100644 examples/rust-dataflow-git/README.md create mode 100644 examples/rust-dataflow-git/dataflow.yml create mode 100644 examples/rust-dataflow-git/run.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 928b81f6..296b9027 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -118,6 +118,9 @@ jobs: - name: "Rust Dataflow example" timeout-minutes: 30 run: cargo run --example rust-dataflow + - name: 
"Rust Git Dataflow example" + timeout-minutes: 30 + run: cargo run --example rust-dataflow-git - name: "Multiple Daemons example" timeout-minutes: 30 run: cargo run --example multiple-daemons diff --git a/Cargo.toml b/Cargo.toml index f67b0fcb..a353bf17 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -132,6 +132,10 @@ path = "examples/vlm/run.rs" name = "rust-dataflow" path = "examples/rust-dataflow/run.rs" +[[example]] +name = "rust-dataflow-git" +path = "examples/rust-dataflow-git/run.rs" + [[example]] name = "rust-ros2-dataflow" path = "examples/rust-ros2-dataflow/run.rs" diff --git a/examples/rust-dataflow-git/README.md b/examples/rust-dataflow-git/README.md new file mode 100644 index 00000000..f4d2f3de --- /dev/null +++ b/examples/rust-dataflow-git/README.md @@ -0,0 +1,7 @@ +# Git-based Rust example + +To get started: + +```bash +cargo run --example rust-dataflow-git +``` diff --git a/examples/rust-dataflow-git/dataflow.yml b/examples/rust-dataflow-git/dataflow.yml new file mode 100644 index 00000000..f4bca5df --- /dev/null +++ b/examples/rust-dataflow-git/dataflow.yml @@ -0,0 +1,29 @@ +nodes: + - id: rust-node + git: https://github.com/dora-rs/dora.git + rev: e31b2a34 # pinned commit, update this when changing the message crate + build: cargo build -p rust-dataflow-example-node + path: target/debug/rust-dataflow-example-node + inputs: + tick: dora/timer/millis/10 + outputs: + - random + + - id: rust-status-node + git: https://github.com/dora-rs/dora.git + rev: e31b2a34 # pinned commit, update this when changing the message crate + build: cargo build -p rust-dataflow-example-status-node + path: target/debug/rust-dataflow-example-status-node + inputs: + tick: dora/timer/millis/100 + random: rust-node/random + outputs: + - status + + - id: rust-sink + git: https://github.com/dora-rs/dora.git + rev: e31b2a34 # pinned commit, update this when changing the message crate + build: cargo build -p rust-dataflow-example-sink + path: target/debug/rust-dataflow-example-sink + inputs: + message: rust-status-node/status diff --git a/examples/rust-dataflow-git/run.rs b/examples/rust-dataflow-git/run.rs new file mode 100644 index 00000000..213b65a0 --- /dev/null +++ b/examples/rust-dataflow-git/run.rs @@ -0,0 +1,52 @@ +use dora_tracing::set_up_tracing; +use eyre::{bail, Context}; +use std::path::Path; + +#[tokio::main] +async fn main() -> eyre::Result<()> { + set_up_tracing("rust-dataflow-runner").wrap_err("failed to set up tracing subscriber")?; + + let root = Path::new(env!("CARGO_MANIFEST_DIR")); + std::env::set_current_dir(root.join(file!()).parent().unwrap()) + .wrap_err("failed to set working dir")?; + + let args: Vec = std::env::args().collect(); + let dataflow = if args.len() > 1 { + Path::new(&args[1]) + } else { + Path::new("dataflow.yml") + }; + + build_dataflow(dataflow).await?; + + run_dataflow(dataflow).await?; + + Ok(()) +} + +async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { + let cargo = std::env::var("CARGO").unwrap(); + let mut cmd = tokio::process::Command::new(&cargo); + cmd.arg("run"); + cmd.arg("--package").arg("dora-cli"); + cmd.arg("--").arg("build").arg(dataflow); + if !cmd.status().await?.success() { + bail!("failed to build dataflow"); + }; + Ok(()) +} + +async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { + let cargo = std::env::var("CARGO").unwrap(); + let mut cmd = tokio::process::Command::new(&cargo); + cmd.arg("run"); + cmd.arg("--package").arg("dora-cli"); + cmd.arg("--") + .arg("daemon") + .arg("--run-dataflow") + .arg(dataflow); + if 
!cmd.status().await?.success() { + bail!("failed to run dataflow"); + }; + Ok(()) +} From 8334821ef7ed80ac5d714b6671459d1f4f84b26c Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 2 Apr 2025 16:05:35 +0200 Subject: [PATCH 004/101] Use vendored `openssl` dependency for `git2` --- Cargo.lock | 10 ++++++++++ binaries/daemon/Cargo.toml | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 01c3a487..b54d3361 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7716,6 +7716,15 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" +[[package]] +name = "openssl-src" +version = "300.4.2+3.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168ce4e058f975fe43e89d9ccf78ca668601887ae736090aacc23ae353c298e2" +dependencies = [ + "cc", +] + [[package]] name = "openssl-sys" version = "0.9.106" @@ -7724,6 +7733,7 @@ checksum = "8bb61ea9811cc39e3c2069f40b8b8e2e70d8569b361f879786cc7ed48b777cdd" dependencies = [ "cc", "libc", + "openssl-src", "pkg-config", "vcpkg", ] diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index f996a758..cd84d675 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -44,5 +44,5 @@ sysinfo = "0.30.11" crossbeam = "0.8.4" crossbeam-skiplist = "0.1.3" zenoh = "1.1.1" -git2 = "0.18.0" url = "2.5.4" +git2 = { version = "0.18.0", features = ["vendored-openssl"] } From 22a3f44164151a2651e2ba7eb17eda9c53bd403a Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 2 Apr 2025 17:37:35 +0200 Subject: [PATCH 005/101] Use non-UNC paths on Windows for git operations --- Cargo.lock | 1 + binaries/daemon/Cargo.toml | 1 + binaries/daemon/src/spawn.rs | 2 ++ 3 files changed, 4 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index b54d3361..639c53b1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3032,6 +3032,7 @@ dependencies = [ "dora-message", "dora-node-api", "dora-tracing", + "dunce", "eyre", "flume 0.10.14", "futures", diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index cd84d675..6b9f7381 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -46,3 +46,4 @@ crossbeam-skiplist = "0.1.3" zenoh = "1.1.1" url = "2.5.4" git2 = { version = "0.18.0", features = ["vendored-openssl"] } +dunce = "1.0.5" diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index ec640952..489400a4 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -563,6 +563,8 @@ impl Spawner<'_> { } else { clone_dir_base }; + let clone_dir = dunce::simplified(&clone_dir).to_owned(); + if clone_dir.exists() { let empty = BTreeSet::new(); let in_use = self.repos_in_use.get(&clone_dir).unwrap_or(&empty); From 0ce26ea4f931a689c12764e1f68872a3aaceeaae Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 2 Apr 2025 18:51:53 +0200 Subject: [PATCH 006/101] Create parent directory before clone operation Required on Windows --- binaries/daemon/src/spawn.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 489400a4..d532a688 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -644,6 +644,12 @@ async fn clone_into( clone_dir: &Path, logger: &mut NodeLogger<'_>, ) -> eyre::Result { + if let Some(parent) = clone_dir.parent() { + tokio::fs::create_dir_all(parent) + .await + .context("failed to create parent directory for 
git clone")?; + } + let rev_str = rev_str(rev); logger .log( From 37de4c234ecbaf424d25e6a0ad530987964c396d Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 9 Apr 2025 14:00:33 +0200 Subject: [PATCH 007/101] Try before-script-linux --- .github/workflows/pip-release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pip-release.yml b/.github/workflows/pip-release.yml index 78820d5d..36550412 100644 --- a/.github/workflows/pip-release.yml +++ b/.github/workflows/pip-release.yml @@ -66,6 +66,7 @@ jobs: args: --release --out dist --zig manylinux: manylinux_2_28 working-directory: ${{ matrix.repository.path }} + before-script-linux: sudo apt install libatomic1 - name: Upload wheels if: github.event_name == 'release' uses: actions/upload-artifact@v4 From 09d4f143568caebb3658e57de984eae83bc634fb Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 9 Apr 2025 19:42:27 +0200 Subject: [PATCH 008/101] Also run build command for non-git nodes --- binaries/daemon/src/spawn.rs | 45 ++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index d532a688..f2100681 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -103,6 +103,10 @@ impl Spawner<'_> { dora_core::descriptor::CoreNodeKind::Custom(n) => { let command = match &n.source { dora_message::descriptor::NodeSource::Local => { + if let Some(build) = &n.build { + self.build_node(logger, &node.env, self.working_dir.clone(), build) + .await?; + } spawn_command_from_path(&self.working_dir, self.uv, logger, &n, true) .await? } @@ -598,25 +602,36 @@ impl Spawner<'_> { checkout_tree(&repository, refname)?; }; if let Some(build) = &node.build { - logger - .log( - LogLevel::Info, - None, - format!("running build command: `{build}"), - ) - .await; - let build = build.to_owned(); - let clone_dir = clone_dir.clone(); - let uv = self.uv; - let node_env = node_env.clone(); - let task = tokio::task::spawn_blocking(move || { - run_build_command(&build, &clone_dir, uv, &node_env).context("build command failed") - }); - task.await??; + self.build_node(logger, node_env, clone_dir.clone(), build) + .await?; } spawn_command_from_path(&clone_dir, self.uv, logger, node, true).await } + async fn build_node( + &mut self, + logger: &mut NodeLogger<'_>, + node_env: &Option>, + working_dir: PathBuf, + build: &String, + ) -> Result<(), eyre::Error> { + logger + .log( + LogLevel::Info, + None, + format!("running build command: `{build}"), + ) + .await; + let build = build.to_owned(); + let uv = self.uv; + let node_env = node_env.clone(); + let task = tokio::task::spawn_blocking(move || { + run_build_command(&build, &working_dir, uv, &node_env).context("build command failed") + }); + task.await??; + Ok(()) + } + fn used_by_other_dataflow( &mut self, dataflow_id: uuid::Uuid, From eed434fe22f6ef75e0ec7be878d3ed349e43284e Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Thu, 10 Apr 2025 14:42:15 +0200 Subject: [PATCH 009/101] Add symlinks to 32-bit libatomic to fix build for x86 --- .github/workflows/pip-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pip-release.yml b/.github/workflows/pip-release.yml index 36550412..1d854096 100644 --- a/.github/workflows/pip-release.yml +++ b/.github/workflows/pip-release.yml @@ -66,7 +66,7 @@ jobs: args: --release --out dist --zig manylinux: manylinux_2_28 working-directory: ${{ matrix.repository.path }} - before-script-linux: 
sudo apt install libatomic1 + before-script-linux: sudo apt-get install libatomic1-i386-cross && mkdir -p $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/i686-unknown-linux-gnu/lib/ && ln -s /usr/i686-linux-gnu/lib/libatomic.so.1 $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/i686-unknown-linux-gnu/lib/libatomic.so && ln -s /usr/i686-linux-gnu/lib/libatomic.so.1 $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/i686-unknown-linux-gnu/lib/libatomic.so.1 && ln -s /usr/i686-linux-gnu/lib/libatomic.so.1 /opt/hostedtoolcache/Python/3.8.18/x64/lib/libatomic.so.1 - name: Upload wheels if: github.event_name == 'release' uses: actions/upload-artifact@v4 From a59428a3c010f3d716ff71242c1cc40b554f3d81 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 11 Apr 2025 15:15:57 +0200 Subject: [PATCH 010/101] Add libatomic for armv7 --- .github/workflows/pip-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pip-release.yml b/.github/workflows/pip-release.yml index 1d854096..f5ef5983 100644 --- a/.github/workflows/pip-release.yml +++ b/.github/workflows/pip-release.yml @@ -66,7 +66,7 @@ jobs: args: --release --out dist --zig manylinux: manylinux_2_28 working-directory: ${{ matrix.repository.path }} - before-script-linux: sudo apt-get install libatomic1-i386-cross && mkdir -p $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/i686-unknown-linux-gnu/lib/ && ln -s /usr/i686-linux-gnu/lib/libatomic.so.1 $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/i686-unknown-linux-gnu/lib/libatomic.so && ln -s /usr/i686-linux-gnu/lib/libatomic.so.1 $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/i686-unknown-linux-gnu/lib/libatomic.so.1 && ln -s /usr/i686-linux-gnu/lib/libatomic.so.1 /opt/hostedtoolcache/Python/3.8.18/x64/lib/libatomic.so.1 + before-script-linux: sudo apt-get install libatomic1-i386-cross libatomic1-armhf-cross && mkdir -p $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/i686-unknown-linux-gnu/lib/ && ln -s /usr/i686-linux-gnu/lib/libatomic.so.1 $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/i686-unknown-linux-gnu/lib/libatomic.so && ln -s /usr/i686-linux-gnu/lib/libatomic.so.1 $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/i686-unknown-linux-gnu/lib/libatomic.so.1 && ln -s /usr/i686-linux-gnu/lib/libatomic.so.1 /opt/hostedtoolcache/Python/3.8.18/x64/lib/libatomic.so.1 && mkdir -p $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/armv7-unknown-linux-gnueabihf/lib/ && ln -s /usr/arm-linux-gnueabihf/lib/libatomic.so.1 $HOME/.rustup/toolchains/1.84-x86_64-unknown-linux-gnu/lib/rustlib/armv7-unknown-linux-gnueabihf/lib/libatomic.so - name: Upload wheels if: github.event_name == 'release' uses: actions/upload-artifact@v4 From c0ac912fc20e67a2ea0b1dcb8f20d908f448906b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 11 Apr 2025 16:11:36 +0200 Subject: [PATCH 011/101] Simplify UNC working directory paths on Windows Fixes the tests/examples of this repo. A UNC path as working directory doesn't work because the build commands of our examples invoke `cargo` on the dora repo. The dora repo uses wildcards in the member list in the top-level Cargo.toml, which are ignored in UNC-paths. Thus, `failed to load manifest for workspace member` errors occur. 
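As a quick illustration of the fix, here is a minimal sketch (not part of
this patch) of what `dunce::simplified` does with the UNC paths that
`Path::canonicalize` produces on Windows; `dunce` is the crate this patch
adds as a dependency, and the example path below is made up:

```rust
use std::path::Path;

fn main() {
    // On Windows, canonicalizing a path typically yields the UNC form:
    let unc = Path::new(r"\\?\C:\Users\dora\dataflow");
    // `dunce::simplified` strips the `\\?\` prefix whenever the path can be
    // represented without it, so tools like `cargo` handle it correctly.
    // On non-Windows platforms it returns the path unchanged.
    println!("{}", dunce::simplified(unc).display());
}
```

Passing the simplified form as the working directory avoids the
wildcard-ignoring behavior described above.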
--- Cargo.lock | 1 + libraries/core/Cargo.toml | 1 + libraries/core/src/build.rs | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 639c53b1..919ab7e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3002,6 +3002,7 @@ name = "dora-core" version = "0.3.10" dependencies = [ "dora-message", + "dunce", "eyre", "log", "once_cell", diff --git a/libraries/core/Cargo.toml b/libraries/core/Cargo.toml index 8ad7952a..7d9233b1 100644 --- a/libraries/core/Cargo.toml +++ b/libraries/core/Cargo.toml @@ -23,3 +23,4 @@ tokio = { version = "1.24.1", features = ["fs", "process", "sync"] } schemars = "0.8.19" serde_json = "1.0.117" log = { version = "0.4.21", features = ["serde"] } +dunce = "1.0.5" diff --git a/libraries/core/src/build.rs b/libraries/core/src/build.rs index 2d900536..7672b66e 100644 --- a/libraries/core/src/build.rs +++ b/libraries/core/src/build.rs @@ -33,7 +33,7 @@ pub fn run_build_command( } } - cmd.current_dir(working_dir); + cmd.current_dir(dunce::simplified(working_dir)); let exit_status = cmd .status() .wrap_err_with(|| format!("failed to run `{}`", build))?; From b0bcef0b212317f43f15d64fd46f726900f94411 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 15 Apr 2025 13:24:04 +0200 Subject: [PATCH 012/101] Extend sleep duration in 'Test CLI' CI job The `dora start` command will now also trigger a build, so the whole thing takes longer. --- .github/workflows/ci.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 296b9027..e988d89a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -352,12 +352,13 @@ jobs: uv run pytest export OPERATING_MODE=SAVE + echo "Running dora up" dora up + echo "Running dora list" dora list - dora build dataflow.yml --uv echo "Running CI Python Test" dora start dataflow.yml --name ci-python-test --detach --uv - sleep 10 + sleep 60 dora stop --name ci-python-test --grace-duration 5s dora destroy @@ -370,7 +371,7 @@ jobs: uv pip install -e apis/python/node dora build examples/python-dataflow/dataflow.yml --uv dora start examples/python-dataflow/dataflow.yml --name ci-python --detach --uv - sleep 10 + sleep 60 dora stop --name ci-python --grace-duration 30s # Run Python Dynamic Node Example @@ -378,13 +379,13 @@ jobs: dora build examples/python-dataflow/dataflow_dynamic.yml --uv dora start examples/python-dataflow/dataflow_dynamic.yml --name ci-python-dynamic --detach --uv uv run opencv-plot --name plot - sleep 10 + sleep 60 dora stop --name ci-python-dynamic --grace-duration 30s # Run Python Operator Example echo "Running CI Operator Test" dora start examples/python-operator-dataflow/dataflow.yml --name ci-python-operator --detach --uv - sleep 10 + sleep 60 dora stop --name ci-python-operator --grace-duration 30s dora destroy From 3dd13b87c0ddeda8609fc1fe55dcb553a7c8cb8e Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 15 Apr 2025 16:46:18 +0200 Subject: [PATCH 013/101] More clone options for loggers --- binaries/daemon/src/log.rs | 55 +++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/binaries/daemon/src/log.rs b/binaries/daemon/src/log.rs index c9e41334..ef45452c 100644 --- a/binaries/daemon/src/log.rs +++ b/binaries/daemon/src/log.rs @@ -1,4 +1,5 @@ use std::{ + ops::{Deref, DerefMut}, path::{Path, PathBuf}, sync::Arc, }; @@ -39,11 +40,18 @@ impl NodeLogger<'_> { .log(level, Some(self.node_id.clone()), target, message) .await } + + pub async fn 
try_clone(&self) -> eyre::Result> { + Ok(NodeLogger { + node_id: self.node_id.clone(), + logger: self.logger.try_clone().await?, + }) + } } pub struct DataflowLogger<'a> { dataflow_id: Uuid, - logger: &'a mut DaemonLogger, + logger: CowMut<'a, DaemonLogger>, } impl<'a> DataflowLogger<'a> { @@ -57,12 +65,12 @@ impl<'a> DataflowLogger<'a> { pub fn reborrow(&mut self) -> DataflowLogger { DataflowLogger { dataflow_id: self.dataflow_id, - logger: self.logger, + logger: CowMut::Borrowed(&mut self.logger), } } pub fn inner(&self) -> &DaemonLogger { - self.logger + &self.logger } pub async fn log( @@ -76,6 +84,13 @@ impl<'a> DataflowLogger<'a> { .log(level, self.dataflow_id, node_id, target, message) .await } + + pub async fn try_clone(&self) -> eyre::Result> { + Ok(DataflowLogger { + dataflow_id: self.dataflow_id, + logger: CowMut::Owned(self.logger.try_clone().await?), + }) + } } pub struct DaemonLogger { @@ -87,7 +102,7 @@ impl DaemonLogger { pub fn for_dataflow(&mut self, dataflow_id: Uuid) -> DataflowLogger { DataflowLogger { dataflow_id, - logger: self, + logger: CowMut::Borrowed(self), } } @@ -120,6 +135,13 @@ impl DaemonLogger { pub(crate) fn daemon_id(&self) -> &DaemonId { &self.daemon_id } + + pub async fn try_clone(&self) -> eyre::Result { + Ok(Self { + daemon_id: self.daemon_id.clone(), + logger: self.logger.try_clone().await?, + }) + } } pub struct Logger { @@ -207,3 +229,28 @@ impl Logger { }) } } + +enum CowMut<'a, T> { + Borrowed(&'a mut T), + Owned(T), +} + +impl Deref for CowMut<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + match self { + CowMut::Borrowed(v) => v, + CowMut::Owned(v) => v, + } + } +} + +impl DerefMut for CowMut<'_, T> { + fn deref_mut(&mut self) -> &mut Self::Target { + match self { + CowMut::Borrowed(v) => v, + CowMut::Owned(v) => v, + } + } +} From dd2ce5ed8748ad96b1773c309c84f31de1273dab Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 15 Apr 2025 16:52:16 +0200 Subject: [PATCH 014/101] Move `repos_in_use` to parameter list to make `Spawner` clonable --- binaries/daemon/src/lib.rs | 8 ++++++-- binaries/daemon/src/spawn.rs | 26 ++++++++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 827e670a..d4304281 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -744,7 +744,6 @@ impl Daemon { dataflow_descriptor, clock: self.clock.clone(), uv, - repos_in_use: &mut self.repos_in_use, }; // spawn nodes and set up subscriptions @@ -768,7 +767,12 @@ impl Daemon { .log(LogLevel::Info, Some("daemon".into()), "spawning") .await; match spawner - .spawn_node(node, node_stderr_most_recent, &mut logger) + .spawn_node( + node, + node_stderr_most_recent, + &mut logger, + &mut self.repos_in_use, + ) .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`")) { diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index f2100681..a44b3fdc 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -45,7 +45,8 @@ use tokio::{ use tracing::error; use url::Url; -pub struct Spawner<'a> { +#[derive(Clone)] +pub struct Spawner { pub dataflow_id: DataflowId, pub working_dir: PathBuf, pub daemon_tx: mpsc::Sender>, @@ -53,15 +54,15 @@ pub struct Spawner<'a> { /// clock is required for generating timestamps when dropping messages early because queue is full pub clock: Arc, pub uv: bool, - pub repos_in_use: &'a mut BTreeMap>, } -impl Spawner<'_> { +impl Spawner { pub async fn spawn_node( &mut 
self, node: ResolvedNode, node_stderr_most_recent: Arc>, logger: &mut NodeLogger<'_>, + repos_in_use: &mut BTreeMap>, ) -> eyre::Result { let dataflow_id = self.dataflow_id; let node_id = node.id.clone(); @@ -111,7 +112,7 @@ impl Spawner<'_> { .await? } dora_message::descriptor::NodeSource::GitBranch { repo, rev } => { - self.spawn_git_node(&n, repo, rev, logger, &node.env) + self.spawn_git_node(&n, repo, rev, logger, &node.env, repos_in_use) .await? } }; @@ -518,6 +519,7 @@ impl Spawner<'_> { rev: &Option, logger: &mut NodeLogger<'_>, node_env: &Option>, + repos_in_use: &mut BTreeMap>, ) -> Result, eyre::Error> { let dataflow_id = self.dataflow_id; let repo_url = Url::parse(repo_addr).context("failed to parse git repository URL")?; @@ -546,7 +548,8 @@ impl Spawner<'_> { } }; let clone_dir = if clone_dir_base.exists() { - let used_by_other_dataflow = self.used_by_other_dataflow(dataflow_id, &clone_dir_base); + let used_by_other_dataflow = + self.used_by_other_dataflow(dataflow_id, &clone_dir_base, repos_in_use); if used_by_other_dataflow { // don't reuse, choose new directory // (TODO reuse if still up to date) @@ -555,7 +558,9 @@ impl Spawner<'_> { let mut i = 1; loop { let new_path = clone_dir_base.with_file_name(format!("{dir_name}-{i}")); - if new_path.exists() && self.used_by_other_dataflow(dataflow_id, &new_path) { + if new_path.exists() + && self.used_by_other_dataflow(dataflow_id, &new_path, repos_in_use) + { i += 1; } else { break new_path; @@ -571,13 +576,13 @@ impl Spawner<'_> { if clone_dir.exists() { let empty = BTreeSet::new(); - let in_use = self.repos_in_use.get(&clone_dir).unwrap_or(&empty); + let in_use = repos_in_use.get(&clone_dir).unwrap_or(&empty); let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); if used_by_other_dataflow { // TODO allow if still up to date eyre::bail!("clone_dir is already in use by other dataflow") } else { - self.repos_in_use + repos_in_use .entry(clone_dir.clone()) .or_default() .insert(dataflow_id); @@ -594,7 +599,7 @@ impl Spawner<'_> { checkout_tree(&repository, refname)?; } } else { - self.repos_in_use + repos_in_use .entry(clone_dir.clone()) .or_default() .insert(dataflow_id); @@ -636,9 +641,10 @@ impl Spawner<'_> { &mut self, dataflow_id: uuid::Uuid, clone_dir_base: &PathBuf, + repos_in_use: &mut BTreeMap>, ) -> bool { let empty = BTreeSet::new(); - let in_use = self.repos_in_use.get(clone_dir_base).unwrap_or(&empty); + let in_use = repos_in_use.get(clone_dir_base).unwrap_or(&empty); let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); used_by_other_dataflow } From e67472f99432df7b1699c0b60d273c6cc33182dc Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 15 Apr 2025 18:40:46 +0200 Subject: [PATCH 015/101] Report spawn result asynchronously to avoid long blocking --- binaries/daemon/src/lib.rs | 94 +++++++++++++++++++++--- binaries/daemon/src/spawn.rs | 137 +++++++++++++++++++++++++---------- 2 files changed, 182 insertions(+), 49 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index d4304281..1d1e5656 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -38,6 +38,7 @@ use socket_stream_utils::socket_stream_send; use spawn::Spawner; use std::{ collections::{BTreeMap, BTreeSet, HashMap}, + future::Future, net::SocketAddr, path::{Path, PathBuf}, pin::pin, @@ -54,6 +55,7 @@ use tokio::{ mpsc::{self, UnboundedSender}, oneshot::{self, Sender}, }, + task::JoinSet, }; use tokio_stream::{wrappers::ReceiverStream, Stream, StreamExt}; use 
tracing::{error, warn}; @@ -384,6 +386,24 @@ impl Daemon { Event::DaemonError(err) => { tracing::error!("Daemon error: {err:?}"); } + Event::SpawnNodeResult { + dataflow_id, + node_id, + result, + } => match result { + Ok(running_node) => { + if let Some(dataflow) = self.running.get_mut(&dataflow_id) { + dataflow.running_nodes.insert(node_id, running_node); + } + } + Err(error) => { + self.dataflow_node_results + .entry(dataflow_id) + .or_default() + .insert(node_id.clone(), Err(error)); + self.handle_node_stop(dataflow_id, &node_id).await?; + } + }, } } @@ -441,10 +461,17 @@ impl Daemon { if let Err(err) = &result { tracing::error!("{err:?}"); } - let reply = - DaemonCoordinatorReply::SpawnResult(result.map_err(|err| format!("{err:?}"))); - let _ = reply_tx.send(Some(reply)).map_err(|_| { - error!("could not send `SpawnResult` reply from daemon to coordinator") + tokio::spawn(async move { + let result = match result { + Err(err) => Err(err), + Ok(task) => task.await, + }; + let reply = DaemonCoordinatorReply::SpawnResult( + result.map_err(|err| format!("{err:?}")), + ); + let _ = reply_tx.send(Some(reply)).map_err(|_| { + error!("could not send `SpawnResult` reply from daemon to coordinator") + }); }); RunStatus::Continue } @@ -685,7 +712,7 @@ impl Daemon { dataflow_descriptor: Descriptor, spawn_nodes: BTreeSet, uv: bool, - ) -> eyre::Result<()> { + ) -> eyre::Result>> { let mut logger = self.logger.for_dataflow(dataflow_id); let dataflow = RunningDataflow::new(dataflow_id, self.daemon_id.clone(), &dataflow_descriptor); @@ -737,7 +764,7 @@ impl Daemon { } } - let mut spawner = Spawner { + let spawner = Spawner { dataflow_id, working_dir, daemon_tx: self.events_tx.clone(), @@ -746,6 +773,8 @@ impl Daemon { uv, }; + let mut tasks = JoinSet::new(); + // spawn nodes and set up subscriptions for node in nodes.into_values() { let mut logger = logger.reborrow().for_node(node.id.clone()); @@ -767,6 +796,7 @@ impl Daemon { .log(LogLevel::Info, Some("daemon".into()), "spawning") .await; match spawner + .clone() .spawn_node( node, node_stderr_most_recent, @@ -776,8 +806,43 @@ impl Daemon { .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`")) { - Ok(running_node) => { - dataflow.running_nodes.insert(node_id, running_node); + Ok(spawn_task) => { + let events_tx = self.events_tx.clone(); + let clock = self.clock.clone(); + tasks.spawn(async move { + let result = spawn_task.await.unwrap_or_else(|err| { + Err(eyre!("failed to join spawn task: {err}")) + }); + let (node_spawn_result, success) = match result { + Ok(node) => (Ok(node), Ok(())), + Err(err) => { + let node_err = NodeError { + timestamp: clock.new_timestamp(), + cause: NodeErrorCause::Other { + stderr: format!("spawn failed: {err:?}"), + }, + exit_status: NodeExitStatus::Unknown, + }; + (Err(node_err), Err(err)) + } + }; + let send_result = events_tx + .send(Timestamped { + inner: Event::SpawnNodeResult { + dataflow_id, + node_id, + result: node_spawn_result, + }, + timestamp: clock.new_timestamp(), + }) + .await; + if send_result.is_err() { + tracing::error!( + "failed to send SpawnNodeResult to main daemon task" + ) + } + success + }); } Err(err) => { logger @@ -858,7 +923,11 @@ impl Daemon { self.handle_node_stop(dataflow_id, &node_id).await?; } - Ok(()) + let spawn_result = async move { + let result: eyre::Result<()> = tasks.join_all().await.into_iter().collect(); + result + }; + Ok(spawn_result) } async fn handle_dynamic_node_event( @@ -1713,7 +1782,7 @@ fn close_input( } #[derive(Debug)] -struct RunningNode { +pub struct 
RunningNode { pid: Option, node_config: NodeConfig, } @@ -2017,6 +2086,11 @@ pub enum Event { CtrlC, SecondCtrlC, DaemonError(eyre::Report), + SpawnNodeResult { + dataflow_id: DataflowId, + node_id: NodeId, + result: Result, + }, } impl From for Event { diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index a44b3fdc..883ceb7d 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -10,8 +10,8 @@ use dora_core::{ build::run_build_command, config::DataId, descriptor::{ - resolve_path, source_is_url, Descriptor, OperatorDefinition, OperatorSource, PythonSource, - ResolvedNode, ResolvedNodeExt, DYNAMIC_SOURCE, SHELL_SOURCE, + resolve_path, source_is_url, CustomNode, Descriptor, OperatorDefinition, OperatorSource, + PythonSource, ResolvedNode, ResolvedNodeExt, DYNAMIC_SOURCE, SHELL_SOURCE, }, get_python_path, uhlc::HLC, @@ -58,12 +58,12 @@ pub struct Spawner { impl Spawner { pub async fn spawn_node( - &mut self, + mut self, node: ResolvedNode, node_stderr_most_recent: Arc>, logger: &mut NodeLogger<'_>, repos_in_use: &mut BTreeMap>, - ) -> eyre::Result { + ) -> eyre::Result>> { let dataflow_id = self.dataflow_id; let node_id = node.id.clone(); logger @@ -87,9 +87,6 @@ impl Spawner { self.clock.clone(), ) .await?; - let send_stdout_to = node - .send_stdout_as() - .context("Could not resolve `send_stdout_as` configuration")?; let node_config = NodeConfig { dataflow_id, @@ -100,6 +97,47 @@ impl Spawner { dynamic: node.kind.dynamic(), }; + let prepared_git = if let dora_core::descriptor::CoreNodeKind::Custom(CustomNode { + source: dora_message::descriptor::NodeSource::GitBranch { repo, rev }, + .. + }) = &node.kind + { + Some(self.prepare_git_node(repo, rev, repos_in_use).await?) + } else { + None + }; + + let mut logger = logger + .try_clone() + .await + .wrap_err("failed to clone logger")?; + let task = async move { + self.spawn_node_inner( + node, + &mut logger, + dataflow_id, + node_config, + prepared_git, + node_stderr_most_recent, + ) + .await + }; + Ok(tokio::spawn(task)) + } + + async fn spawn_node_inner( + &mut self, + node: ResolvedNode, + logger: &mut NodeLogger<'_>, + dataflow_id: uuid::Uuid, + node_config: NodeConfig, + prepared_git: Option, + node_stderr_most_recent: Arc>, + ) -> Result { + let send_stdout_to = node + .send_stdout_as() + .context("Could not resolve `send_stdout_as` configuration")?; + let mut child = match node.kind { dora_core::descriptor::CoreNodeKind::Custom(n) => { let command = match &n.source { @@ -112,7 +150,7 @@ impl Spawner { .await? } dora_message::descriptor::NodeSource::GitBranch { repo, rev } => { - self.spawn_git_node(&n, repo, rev, logger, &node.env, repos_in_use) + self.spawn_git_node(&n, repo, rev, logger, &node.env, prepared_git.unwrap()) .await? } }; @@ -442,6 +480,7 @@ impl Spawner { .try_clone() .await .context("failed to clone logger")?; + // Log to file stream. 
tokio::spawn(async move { while let Some(message) = rx.recv().await { @@ -512,24 +551,16 @@ impl Spawner { Ok(running_node) } - async fn spawn_git_node( + async fn prepare_git_node( &mut self, - node: &dora_core::descriptor::CustomNode, repo_addr: &String, rev: &Option, - logger: &mut NodeLogger<'_>, - node_env: &Option>, repos_in_use: &mut BTreeMap>, - ) -> Result, eyre::Error> { + ) -> eyre::Result { let dataflow_id = self.dataflow_id; let repo_url = Url::parse(repo_addr).context("failed to parse git repository URL")?; let target_dir = self.working_dir.join("build"); - let rev_str = rev_str(rev); - let refname = rev.clone().map(|rev| match rev { - GitRepoRev::Branch(branch) => format!("refs/remotes/origin/{branch}"), - GitRepoRev::Tag(tag) => format!("refs/tags/{tag}"), - GitRepoRev::Rev(rev) => rev, - }); + let clone_dir_base = { let base = { let mut path = @@ -574,7 +605,7 @@ impl Spawner { }; let clone_dir = dunce::simplified(&clone_dir).to_owned(); - if clone_dir.exists() { + let reuse = if clone_dir.exists() { let empty = BTreeSet::new(); let in_use = repos_in_use.get(&clone_dir).unwrap_or(&empty); let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); @@ -582,27 +613,50 @@ impl Spawner { // TODO allow if still up to date eyre::bail!("clone_dir is already in use by other dataflow") } else { - repos_in_use - .entry(clone_dir.clone()) - .or_default() - .insert(dataflow_id); - logger - .log( - LogLevel::Info, - None, - format!("reusing {repo_addr}{rev_str}"), - ) - .await; - let refname_cloned = refname.clone(); - let clone_dir = clone_dir.clone(); - let repository = fetch_changes(clone_dir, refname_cloned).await?; - checkout_tree(&repository, refname)?; + true } } else { - repos_in_use - .entry(clone_dir.clone()) - .or_default() - .insert(dataflow_id); + false + }; + repos_in_use + .entry(clone_dir.clone()) + .or_default() + .insert(dataflow_id); + + Ok(PreparedGit { clone_dir, reuse }) + } + + async fn spawn_git_node( + &mut self, + node: &dora_core::descriptor::CustomNode, + repo_addr: &String, + rev: &Option, + logger: &mut NodeLogger<'_>, + node_env: &Option>, + prepared: PreparedGit, + ) -> Result, eyre::Error> { + let PreparedGit { clone_dir, reuse } = prepared; + + let rev_str = rev_str(rev); + let refname = rev.clone().map(|rev| match rev { + GitRepoRev::Branch(branch) => format!("refs/remotes/origin/{branch}"), + GitRepoRev::Tag(tag) => format!("refs/tags/{tag}"), + GitRepoRev::Rev(rev) => rev, + }); + + if reuse { + logger + .log( + LogLevel::Info, + None, + format!("reusing {repo_addr}{rev_str}"), + ) + .await; + let refname_cloned = refname.clone(); + let clone_dir = clone_dir.clone(); + let repository = fetch_changes(clone_dir, refname_cloned).await?; + checkout_tree(&repository, refname)?; + } else { let repository = clone_into(repo_addr, rev, &clone_dir, logger).await?; checkout_tree(&repository, refname)?; }; @@ -849,3 +903,8 @@ async fn spawn_command_from_path( Ok(Some(cmd)) } + +struct PreparedGit { + clone_dir: PathBuf, + reuse: bool, +} From d64584bf97ef361e1e9d072198c4f42771e7216c Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 15 Apr 2025 20:01:38 +0200 Subject: [PATCH 016/101] Avoid concurrent checkout by only checking out first use within the dataflow --- binaries/daemon/src/spawn.rs | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 883ceb7d..8c875f10 100644 --- a/binaries/daemon/src/spawn.rs +++ 
b/binaries/daemon/src/spawn.rs @@ -605,25 +605,31 @@ impl Spawner { }; let clone_dir = dunce::simplified(&clone_dir).to_owned(); - let reuse = if clone_dir.exists() { + let (reuse, checkout) = if clone_dir.exists() { let empty = BTreeSet::new(); let in_use = repos_in_use.get(&clone_dir).unwrap_or(&empty); let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); if used_by_other_dataflow { // TODO allow if still up to date eyre::bail!("clone_dir is already in use by other dataflow") + } else if in_use.is_empty() { + (true, true) } else { - true + (true, false) } } else { - false + (false, true) }; repos_in_use .entry(clone_dir.clone()) .or_default() .insert(dataflow_id); - Ok(PreparedGit { clone_dir, reuse }) + Ok(PreparedGit { + clone_dir, + reuse, + checkout, + }) } async fn spawn_git_node( @@ -635,7 +641,11 @@ impl Spawner { node_env: &Option>, prepared: PreparedGit, ) -> Result, eyre::Error> { - let PreparedGit { clone_dir, reuse } = prepared; + let PreparedGit { + clone_dir, + reuse, + checkout, + } = prepared; let rev_str = rev_str(rev); let refname = rev.clone().map(|rev| match rev { @@ -655,10 +665,14 @@ impl Spawner { let refname_cloned = refname.clone(); let clone_dir = clone_dir.clone(); let repository = fetch_changes(clone_dir, refname_cloned).await?; - checkout_tree(&repository, refname)?; + if checkout { + checkout_tree(&repository, refname)?; + } } else { let repository = clone_into(repo_addr, rev, &clone_dir, logger).await?; - checkout_tree(&repository, refname)?; + if checkout { + checkout_tree(&repository, refname)?; + } }; if let Some(build) = &node.build { self.build_node(logger, node_env, clone_dir.clone(), build) @@ -907,4 +921,5 @@ async fn spawn_command_from_path( struct PreparedGit { clone_dir: PathBuf, reuse: bool, + checkout: bool, } From ea3793cb521b203aefdcd93fd5b413eb7b29aa76 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 16 Apr 2025 10:39:25 +0200 Subject: [PATCH 017/101] Log when receiving SpawnNodeResult with invalid/stopped dataflow ID --- binaries/daemon/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 1d1e5656..b8d4fa71 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -394,6 +394,8 @@ impl Daemon { Ok(running_node) => { if let Some(dataflow) = self.running.get_mut(&dataflow_id) { dataflow.running_nodes.insert(node_id, running_node); + } else { + tracing::error!("failed to handle SpawnNodeResult: no running dataflow with ID {dataflow_id}"); } } Err(error) => { From 4e908d170981ec01ab4dda2878041d08ff08da76 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 16 Apr 2025 10:39:43 +0200 Subject: [PATCH 018/101] Don't finish dataflow while there are still pending nodes --- binaries/daemon/src/lib.rs | 9 +++++---- binaries/daemon/src/pending.rs | 4 ++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index b8d4fa71..c759d63f 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -1377,10 +1377,11 @@ impl Daemon { if let Some(mut pid) = dataflow.running_nodes.remove(node_id).and_then(|n| n.pid) { pid.mark_as_stopped() } - if dataflow - .running_nodes - .iter() - .all(|(_id, n)| n.node_config.dynamic) + if !dataflow.pending_nodes.local_nodes_pending() + && dataflow + .running_nodes + .iter() + .all(|(_id, n)| n.node_config.dynamic) { let result = DataflowDaemonResult { timestamp: self.clock.new_timestamp(), diff --git 
a/binaries/daemon/src/pending.rs b/binaries/daemon/src/pending.rs index 89305d80..757a858d 100644 --- a/binaries/daemon/src/pending.rs +++ b/binaries/daemon/src/pending.rs @@ -59,6 +59,10 @@ impl PendingNodes { self.external_nodes = value; } + pub fn local_nodes_pending(&self) -> bool { + !self.local_nodes.is_empty() + } + pub async fn handle_node_subscription( &mut self, node_id: NodeId, From d27295747360a9cdf7e1052185028a78a94fa1cf Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 16 Apr 2025 11:11:21 +0200 Subject: [PATCH 019/101] Also consider `repos_in_use` when checking whether `clone_dir` exists The actual directory creation and clone call will happen later in an asynchronous task, but we still want subsequent nodes to treat this directory as used. Otherwise, multiple nodes might try to clone into the same directory, which can lead to errors. --- binaries/daemon/src/spawn.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 8c875f10..55be0491 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -32,7 +32,7 @@ use dora_node_api::{ use eyre::{ContextCompat, WrapErr}; use git2::FetchOptions; use std::{ - collections::{BTreeMap, BTreeSet}, + collections::{BTreeMap, BTreeSet, HashMap}, path::{Path, PathBuf}, process::Stdio, sync::Arc, @@ -44,6 +44,7 @@ use tokio::{ }; use tracing::error; use url::Url; +use uuid::Uuid; #[derive(Clone)] pub struct Spawner { @@ -578,7 +579,7 @@ impl Spawner { }, } }; - let clone_dir = if clone_dir_base.exists() { + let clone_dir = if clone_dir_exists(&clone_dir_base, repos_in_use) { let used_by_other_dataflow = self.used_by_other_dataflow(dataflow_id, &clone_dir_base, repos_in_use); if used_by_other_dataflow { @@ -589,7 +590,7 @@ impl Spawner { let mut i = 1; loop { let new_path = clone_dir_base.with_file_name(format!("{dir_name}-{i}")); - if new_path.exists() + if clone_dir_exists(&new_path, repos_in_use) && self.used_by_other_dataflow(dataflow_id, &new_path, repos_in_use) { i += 1; @@ -605,7 +606,7 @@ impl Spawner { }; let clone_dir = dunce::simplified(&clone_dir).to_owned(); - let (reuse, checkout) = if clone_dir.exists() { + let (reuse, checkout) = if clone_dir_exists(&clone_dir, repos_in_use) { let empty = BTreeSet::new(); let in_use = repos_in_use.get(&clone_dir).unwrap_or(&empty); let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); @@ -923,3 +924,7 @@ struct PreparedGit { reuse: bool, checkout: bool, } + +fn clone_dir_exists(dir: &PathBuf, repos_in_use: &BTreeMap>) -> bool { + repos_in_use.contains_key(dir) || dir.exists() +} From 1659d354daef86223e31b2af39abec4d04fed59d Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 16 Apr 2025 11:30:38 +0200 Subject: [PATCH 020/101] Spawn, clone, and build nodes sequentially Spawn nodes one-by-one instead of doing it concurrently. Subsequent nodes might want to reuse git repos or build artifacts of previous nodes. Spawning them in parallel might lead to conflicts and errors (e.g. because a git repo reuse is handled before a git clone or because some build command is not concurrency-safe). 
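The diff below boils down to the following pattern (a simplified sketch, not
the daemon's actual API): collect the per-node spawn futures into a `Vec` and
await them in order, so each node's clone and build completes before the next
one starts:

```rust
// Sketch: sequential awaiting of spawn futures. `eyre` is the error library
// used throughout this repo; the function name is illustrative only.
async fn spawn_all_sequentially(
    tasks: Vec<impl std::future::Future<Output = eyre::Result<()>>>,
) -> eyre::Result<()> {
    for task in tasks {
        // Later nodes may reuse the git checkout or build artifacts of
        // earlier ones, so wait for each spawn to finish before starting
        // the next.
        task.await?;
    }
    Ok(())
}
```

Compared to a `JoinSet`, this gives up parallelism but removes the need for
any locking around the shared clone directories.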
---
 binaries/daemon/src/lib.rs   | 19 +++++++++----------
 binaries/daemon/src/spawn.rs |  7 ++++---
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs
index c759d63f..35ce54c4 100644
--- a/binaries/daemon/src/lib.rs
+++ b/binaries/daemon/src/lib.rs
@@ -55,7 +55,6 @@ use tokio::{
         mpsc::{self, UnboundedSender},
         oneshot::{self, Sender},
     },
-    task::JoinSet,
 };
 use tokio_stream::{wrappers::ReceiverStream, Stream, StreamExt};
 use tracing::{error, warn};
@@ -775,7 +774,7 @@ impl Daemon {
             uv,
         };
 
-        let mut tasks = JoinSet::new();
+        let mut tasks = Vec::new();
 
         // spawn nodes and set up subscriptions
        for node in nodes.into_values() {
@@ -808,14 +807,11 @@ impl Daemon {
                 .await
                 .wrap_err_with(|| format!("failed to spawn node `{node_id}`"))
             {
-                Ok(spawn_task) => {
+                Ok(result) => {
                     let events_tx = self.events_tx.clone();
                     let clock = self.clock.clone();
-                    tasks.spawn(async move {
-                        let result = spawn_task.await.unwrap_or_else(|err| {
-                            Err(eyre!("failed to join spawn task: {err}"))
-                        });
-                        let (node_spawn_result, success) = match result {
+                    tasks.push(async move {
+                        let (node_spawn_result, success) = match result.await {
                             Ok(node) => (Ok(node), Ok(())),
                             Err(err) => {
                                 let node_err = NodeError {
@@ -926,9 +922,12 @@ impl Daemon {
         }
 
         let spawn_result = async move {
-            let result: eyre::Result<()> = tasks.join_all().await.into_iter().collect();
-            result
+            for task in tasks {
+                task.await?;
+            }
+            Ok(())
         };
+
         Ok(spawn_result)
     }
diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs
index 55be0491..3927eb3c 100644
--- a/binaries/daemon/src/spawn.rs
+++ b/binaries/daemon/src/spawn.rs
@@ -32,7 +32,8 @@ use dora_node_api::{
 use eyre::{ContextCompat, WrapErr};
 use git2::FetchOptions;
 use std::{
-    collections::{BTreeMap, BTreeSet, HashMap},
+    collections::{BTreeMap, BTreeSet},
+    future::Future,
     path::{Path, PathBuf},
     process::Stdio,
     sync::Arc,
@@ -64,7 +65,7 @@ impl Spawner {
     pub async fn spawn_node(
         node: ResolvedNode,
         node_stderr_most_recent: Arc<ArrayQueue<String>>,
         logger: &mut NodeLogger<'_>,
         repos_in_use: &mut BTreeMap<PathBuf, BTreeSet<Uuid>>,
-    ) -> eyre::Result<tokio::task::JoinHandle<eyre::Result<RunningNode>>> {
+    ) -> eyre::Result<impl Future<Output = eyre::Result<RunningNode>>> {
         let dataflow_id = self.dataflow_id;
         let node_id = node.id.clone();
         logger
@@ -123,7 +124,7 @@ impl Spawner {
             )
             .await
         };
-        Ok(tokio::spawn(task))
+        Ok(task)
     }
 
     async fn spawn_node_inner(
From 343588d53a7f36749d00d4acf2c2cd2597feb282 Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Wed, 16 Apr 2025 11:49:21 +0200
Subject: [PATCH 021/101] Add build command to Python CLI test again

Shortens the time required for dora start, which allows us to keep the
sleep duration lower.
---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e988d89a..f84afaad 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -356,6 +356,7 @@ jobs:
           dora up
           echo "Running dora list"
           dora list
+          dora build dataflow.yml --uv
           echo "Running CI Python Test"
           dora start dataflow.yml --name ci-python-test --detach --uv
           sleep 60
From 09b69a10630314acf34b7fa1b5694cf38e9e16fd Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Wed, 16 Apr 2025 12:36:59 +0200
Subject: [PATCH 022/101] Sleep a bit after `dora destroy` to avoid races

---
 .github/workflows/ci.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f84afaad..7b52f16d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -362,6 +362,7 @@ jobs:
           sleep 60
           dora stop --name ci-python-test --grace-duration 5s
           dora destroy
+          sleep 5
 
           cd ..
@@ -390,6 +391,7 @@ jobs: dora stop --name ci-python-operator --grace-duration 30s dora destroy + sleep 5 # Run Python queue latency test echo "Running CI Queue Latency Test" From 247d4f9d03c2970c3e43215fae90f7b39982f1b6 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 16 Apr 2025 13:05:41 +0200 Subject: [PATCH 023/101] Longer sleeps in Python CLI example --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7b52f16d..4b6e008f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -381,13 +381,13 @@ jobs: dora build examples/python-dataflow/dataflow_dynamic.yml --uv dora start examples/python-dataflow/dataflow_dynamic.yml --name ci-python-dynamic --detach --uv uv run opencv-plot --name plot - sleep 60 + sleep 240 dora stop --name ci-python-dynamic --grace-duration 30s # Run Python Operator Example echo "Running CI Operator Test" dora start examples/python-operator-dataflow/dataflow.yml --name ci-python-operator --detach --uv - sleep 60 + sleep 120 dora stop --name ci-python-operator --grace-duration 30s dora destroy From d9c18fd82b6e4fcee86c5203c3f176b5e5105aaf Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 16 Apr 2025 13:53:28 +0200 Subject: [PATCH 024/101] CI: Print when running `dora stop` --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4b6e008f..7c819c9f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -360,6 +360,7 @@ jobs: echo "Running CI Python Test" dora start dataflow.yml --name ci-python-test --detach --uv sleep 60 + echo "Running dora stop" dora stop --name ci-python-test --grace-duration 5s dora destroy sleep 5 @@ -374,6 +375,7 @@ jobs: dora build examples/python-dataflow/dataflow.yml --uv dora start examples/python-dataflow/dataflow.yml --name ci-python --detach --uv sleep 60 + echo "Running dora stop" dora stop --name ci-python --grace-duration 30s # Run Python Dynamic Node Example @@ -382,12 +384,14 @@ jobs: dora start examples/python-dataflow/dataflow_dynamic.yml --name ci-python-dynamic --detach --uv uv run opencv-plot --name plot sleep 240 + echo "Running dora stop" dora stop --name ci-python-dynamic --grace-duration 30s # Run Python Operator Example echo "Running CI Operator Test" dora start examples/python-operator-dataflow/dataflow.yml --name ci-python-operator --detach --uv sleep 120 + echo "Running dora stop" dora stop --name ci-python-operator --grace-duration 30s dora destroy From 63c548991f19a3eaa51fe9fa25b90eeb424e4722 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 16 Apr 2025 14:24:01 +0200 Subject: [PATCH 025/101] Warn if event handling takes too long in daemon or coordinator --- binaries/coordinator/src/lib.rs | 11 +++++++++++ binaries/daemon/src/lib.rs | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index e002f859..cdad4ebd 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -201,6 +201,9 @@ async fn start_inner( let mut daemon_connections = DaemonConnections::default(); while let Some(event) = events.next().await { + let start = Instant::now(); + let event_debug = format!("{event:?}"); + if event.log() { tracing::trace!("Handling event {event:?}"); } @@ -704,6 +707,14 @@ async fn start_inner( daemon_connections.remove(&daemon_id); } } + + let elapsed = start.elapsed(); + if 
elapsed > Duration::from_millis(100) { + tracing::warn!( + "Coordinator took {}ms for handling event: {event_debug}", + elapsed.as_millis() + ); + } } tracing::info!("stopped"); diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 35ce54c4..1e4ca636 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -320,6 +320,9 @@ impl Daemon { tracing::warn!("failed to update HLC with incoming event timestamp: {err}"); } + let start = Instant::now(); + let event_debug = format!("{inner:?}"); + match inner { Event::Coordinator(CoordinatorEvent { event, reply_tx }) => { let status = self.handle_coordinator_event(event, reply_tx).await?; @@ -406,6 +409,14 @@ impl Daemon { } }, } + + let elapsed = start.elapsed(); + if elapsed > Duration::from_millis(100) { + tracing::warn!( + "Daemon took {}ms for handling event: {event_debug}", + elapsed.as_millis() + ); + } } if let Some(mut connection) = self.coordinator_connection.take() { From 49b87daeae419eb36335e3a73e0707dbb117e4d8 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 16 Apr 2025 15:25:14 +0200 Subject: [PATCH 026/101] Handle dataflow spawn asynchronously in coordinator too --- binaries/coordinator/src/lib.rs | 58 +++++++++++++-- binaries/coordinator/src/listener.rs | 13 ++++ binaries/coordinator/src/run/mod.rs | 5 +- binaries/daemon/src/lib.rs | 70 +++++++++++++++---- .../message/src/daemon_to_coordinator.rs | 6 +- 5 files changed, 127 insertions(+), 25 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index cdad4ebd..772f3945 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -421,12 +421,15 @@ async fn start_inner( .await?; Ok(dataflow) }; - let reply = inner.await.map(|dataflow| { - let uuid = dataflow.uuid; - running_dataflows.insert(uuid, dataflow); - ControlRequestReply::DataflowStarted { uuid } - }); - let _ = reply_sender.send(reply); + match inner.await { + Ok(mut dataflow) => { + dataflow.spawn_result_tx = Some(reply_sender); + running_dataflows.insert(dataflow.uuid, dataflow); + } + Err(err) => { + let _ = reply_sender.send(Err(err)); + } + } } ControlRequest::Check { dataflow_uuid } => { let status = match &running_dataflows.get(&dataflow_uuid) { @@ -706,6 +709,37 @@ async fn start_inner( tracing::info!("Daemon `{daemon_id}` exited"); daemon_connections.remove(&daemon_id); } + Event::DataflowSpawnResult { + dataflow_id, + daemon_id, + result, + } => match running_dataflows.get_mut(&dataflow_id) { + Some(dataflow) => { + dataflow.pending_spawn_results.remove(&daemon_id); + match result { + Ok(()) => { + if dataflow.pending_spawn_results.is_empty() { + tracing::info!("successfully spawned dataflow `{dataflow_id}`"); + if let Some(reply_tx) = dataflow.spawn_result_tx.take() { + let _ = + reply_tx.send(Ok(ControlRequestReply::DataflowStarted { + uuid: dataflow_id, + })); + } + } + } + Err(err) => { + tracing::warn!("error while spawning dataflow `{dataflow_id}`"); + if let Some(reply_tx) = dataflow.spawn_result_tx.take() { + let _ = reply_tx.send(Err(err)); + } + } + }; + } + None => { + tracing::warn!("received DataflowSpawnResult, but no matching dataflow in `running_dataflows` map"); + } + }, } let elapsed = start.elapsed(); @@ -811,6 +845,9 @@ struct RunningDataflow { reply_senders: Vec>>, log_subscribers: Vec, + + pending_spawn_results: BTreeSet, + spawn_result_tx: Option>>, } struct ArchivedDataflow { @@ -1025,10 +1062,12 @@ async fn start_dataflow( BTreeSet::new() }, exited_before_subscribe: 
Default::default(),
-            daemons,
+            daemons: daemons.clone(),
             nodes,
             reply_senders: Vec::new(),
             log_subscribers: Vec::new(),
+            pending_spawn_results: daemons,
+            spawn_result_tx: None,
     })
 }
@@ -1103,6 +1142,11 @@ pub enum Event {
     DaemonExit {
         daemon_id: dora_message::common::DaemonId,
     },
+    DataflowSpawnResult {
+        dataflow_id: uuid::Uuid,
+        daemon_id: DaemonId,
+        result: eyre::Result<()>,
+    },
 }
 
 impl Event {
diff --git a/binaries/coordinator/src/listener.rs b/binaries/coordinator/src/listener.rs
index 6c666082..39e17bca 100644
--- a/binaries/coordinator/src/listener.rs
+++ b/binaries/coordinator/src/listener.rs
@@ -112,6 +112,19 @@ pub async fn handle_connection(
                         break;
                     }
                 }
+                DaemonEvent::SpawnResult {
+                    dataflow_id,
+                    result,
+                } => {
+                    let event = Event::DataflowSpawnResult {
+                        dataflow_id,
+                        daemon_id,
+                        result: result.map_err(|err| eyre::eyre!(err)),
+                    };
+                    if events_tx.send(event).await.is_err() {
+                        break;
+                    }
+                }
             },
         };
     }
diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs
index f6f88e83..09c2b1a4 100644
--- a/binaries/coordinator/src/run/mod.rs
+++ b/binaries/coordinator/src/run/mod.rs
@@ -58,7 +58,7 @@ pub(super) async fn spawn_dataflow(
         daemons.insert(daemon_id);
     }
 
-    tracing::info!("successfully spawned dataflow `{uuid}`");
+    tracing::info!("successfully triggered dataflow spawn `{uuid}`");
 
     Ok(SpawnedDataflow {
         uuid,
@@ -90,13 +90,14 @@ async fn spawn_dataflow_on_machine(
     tcp_send(&mut daemon_connection.stream, message)
         .await
         .wrap_err("failed to send spawn message to daemon")?;
+
     let reply_raw = tcp_receive(&mut daemon_connection.stream)
         .await
        .wrap_err("failed to receive spawn reply from daemon")?;
     match serde_json::from_slice(&reply_raw)
         .wrap_err("failed to deserialize spawn reply from daemon")?
     {
-        DaemonCoordinatorReply::SpawnResult(result) => result
+        DaemonCoordinatorReply::TriggerSpawnResult(result) => result
             .map_err(|e| eyre!(e))
             .wrap_err("daemon returned an error")?,
         _ => bail!("unexpected reply"),
diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs
index 1e4ca636..2046384c 100644
--- a/binaries/daemon/src/lib.rs
+++ b/binaries/daemon/src/lib.rs
@@ -203,7 +203,7 @@ impl Daemon {
             .map_err(|err| eyre!("failed to receive spawn result: {err}"))
             .and_then(|r| async {
                 match r {
-                    Some(DaemonCoordinatorReply::SpawnResult(result)) => {
+                    Some(DaemonCoordinatorReply::TriggerSpawnResult(result)) => {
                         result.map_err(|err| eyre!(err))
                     }
                     _ => Err(eyre!("unexpected spawn reply")),
@@ -408,6 +408,26 @@ impl Daemon {
                     self.handle_node_stop(dataflow_id, &node_id).await?;
                 }
             },
+            Event::SpawnDataflowResult {
+                dataflow_id,
+                result,
+            } => {
+                if let Some(connection) = &mut self.coordinator_connection {
+                    let msg = serde_json::to_vec(&Timestamped {
+                        inner: CoordinatorRequest::Event {
+                            daemon_id: self.daemon_id.clone(),
+                            event: DaemonEvent::SpawnResult {
+                                dataflow_id,
+                                result: result.map_err(|err| format!("{err:?}")),
+                            },
+                        },
+                        timestamp: self.clock.new_timestamp(),
+                    })?;
+                    socket_stream_send(connection, &msg)
+                        .await
+                        .wrap_err("failed to send SpawnResult event to dora-coordinator")?;
+                }
+            }
         }
 
         let elapsed = start.elapsed();
@@ -470,21 +490,37 @@ impl Daemon {
                     uv,
                 )
                 .await;
-                if let Err(err) = &result {
-                    tracing::error!("{err:?}");
-                }
-                tokio::spawn(async move {
-                    let result = match result {
-                        Err(err) => Err(err),
-                        Ok(task) => task.await,
-                    };
-                    let reply = DaemonCoordinatorReply::SpawnResult(
-                        result.map_err(|err| format!("{err:?}")),
-                    );
-                    let _ = reply_tx.send(Some(reply)).map_err(|_| {
-                        error!("could not send `SpawnResult` reply from daemon to coordinator")
-                    });
+                let (trigger_result, result_task) = match result {
+                    Ok(result_task) => (Ok(()), Some(result_task)),
+                    Err(err) => (Err(format!("{err:?}")), None),
+                };
+                let reply = DaemonCoordinatorReply::TriggerSpawnResult(trigger_result);
+                let _ = reply_tx.send(Some(reply)).map_err(|_| {
+                    error!("could not send `TriggerSpawnResult` reply from daemon to coordinator")
                 });
+
+                let result_tx = self.events_tx.clone();
+                let clock = self.clock.clone();
+                if let Some(result_task) = result_task {
+                    tokio::spawn(async move {
+                        let message = Timestamped {
+                            inner: Event::SpawnDataflowResult {
+                                dataflow_id,
+                                result: result_task.await,
+                            },
+                            timestamp: clock.new_timestamp(),
+                        };
+                        let _ = result_tx
+                            .send(message)
+                            .map_err(|_| {
+                                error!(
+                                    "could not send `SpawnDataflowResult` event to the daemon main loop"
+                                )
+                            })
+                            .await;
+                    });
+                }
+
                 RunStatus::Continue
             }
             DaemonCoordinatorEvent::AllNodesReady {
@@ -2104,6 +2140,10 @@ pub enum Event {
         node_id: NodeId,
         result: Result<RunningNode, NodeError>,
     },
+    SpawnDataflowResult {
+        dataflow_id: Uuid,
+        result: eyre::Result<()>,
+    },
 }
 
 impl From for Event {
diff --git a/libraries/message/src/daemon_to_coordinator.rs b/libraries/message/src/daemon_to_coordinator.rs
index 22bd0e5f..309697be 100644
--- a/libraries/message/src/daemon_to_coordinator.rs
+++ b/libraries/message/src/daemon_to_coordinator.rs
@@ -46,6 +46,10 @@ impl DaemonRegisterRequest {
 
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub enum DaemonEvent {
+    SpawnResult {
+        dataflow_id: DataflowId,
+        result: Result<(), String>,
+    },
     AllNodesReady {
         dataflow_id: DataflowId,
         exited_before_subscribe: Vec<NodeId>,
@@ -73,7 +77,7 @@ impl DataflowDaemonResult {
 
 #[derive(Debug, serde::Deserialize, serde::Serialize)]
 pub enum DaemonCoordinatorReply {
-    SpawnResult(Result<(), String>),
+    TriggerSpawnResult(Result<(), String>),
     ReloadResult(Result<(), String>),
     StopResult(Result<(), String>),
     DestroyResult {

From 740ca7494ab631a0c6df70e1a838d1196ec6ac49 Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Wed, 16 Apr 2025 16:47:27 +0200
Subject: [PATCH 027/101] Remove unused function

---
 binaries/coordinator/src/lib.rs | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs
index 772f3945..74321f24 100644
--- a/binaries/coordinator/src/lib.rs
+++ b/binaries/coordinator/src/lib.rs
@@ -157,10 +157,6 @@ impl DaemonConnections {
         self.daemons.keys()
     }
 
-    fn iter(&self) -> impl Iterator {
-        self.daemons.iter()
-    }
-
     fn iter_mut(&mut self) -> impl Iterator {
         self.daemons.iter_mut()
     }

From 0786b63f8f2d78d6950a666871ac3f23c21087a2 Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Wed, 16 Apr 2025 17:09:36 +0200
Subject: [PATCH 028/101] Restore original sleep durations in CLI test

---
 .github/workflows/ci.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7c819c9f..18f8f008 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -359,7 +359,7 @@ jobs:
           dora build dataflow.yml --uv
           echo "Running CI Python Test"
           dora start dataflow.yml --name ci-python-test --detach --uv
-          sleep 60
+          sleep 10
          echo "Running dora stop"
           dora stop --name ci-python-test --grace-duration 5s
           dora destroy
@@ -374,7 +374,7 @@ jobs:
           uv pip install -e apis/python/node
           dora build examples/python-dataflow/dataflow.yml --uv
           dora start examples/python-dataflow/dataflow.yml --name ci-python --detach --uv
-          sleep 60
+          sleep 10
           echo "Running dora stop"
           dora stop --name ci-python
--grace-duration 30s @@ -383,14 +383,14 @@ jobs: dora build examples/python-dataflow/dataflow_dynamic.yml --uv dora start examples/python-dataflow/dataflow_dynamic.yml --name ci-python-dynamic --detach --uv uv run opencv-plot --name plot - sleep 240 + sleep 10 echo "Running dora stop" dora stop --name ci-python-dynamic --grace-duration 30s # Run Python Operator Example echo "Running CI Operator Test" dora start examples/python-operator-dataflow/dataflow.yml --name ci-python-operator --detach --uv - sleep 120 + sleep 10 echo "Running dora stop" dora stop --name ci-python-operator --grace-duration 30s From d7f30c362d692454807e6f1b555100b785b627b5 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 16 Apr 2025 17:56:31 +0200 Subject: [PATCH 029/101] Disable sccache for `musllinux` jobs --- .github/workflows/pip-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pip-release.yml b/.github/workflows/pip-release.yml index f5ef5983..711698fb 100644 --- a/.github/workflows/pip-release.yml +++ b/.github/workflows/pip-release.yml @@ -101,7 +101,7 @@ jobs: with: target: ${{ matrix.platform.target }} args: --release --out dist - sccache: "true" + sccache: "false" manylinux: musllinux_1_2 working-directory: ${{ matrix.repository.path }} From 20a1ade6add4d22ea7c9324aca87ea96651f9f91 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 23 Apr 2025 17:30:40 +0200 Subject: [PATCH 030/101] Only use event kind in latency warning (instead of debug output) Avoid performance overhead. Add comments on why we check the handling time. --- binaries/coordinator/src/lib.rs | 22 ++++++++++++++++++++-- binaries/daemon/src/lib.rs | 24 ++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 74321f24..2bb2a9fc 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -197,8 +197,9 @@ async fn start_inner( let mut daemon_connections = DaemonConnections::default(); while let Some(event) = events.next().await { + // used below for measuring the event handling duration let start = Instant::now(); - let event_debug = format!("{event:?}"); + let event_kind = event.kind(); if event.log() { tracing::trace!("Handling event {event:?}"); @@ -738,10 +739,11 @@ async fn start_inner( }, } + // warn if event handling took too long -> the main loop should never be blocked for too long let elapsed = start.elapsed(); if elapsed > Duration::from_millis(100) { tracing::warn!( - "Coordinator took {}ms for handling event: {event_debug}", + "Coordinator took {}ms for handling event: {event_kind}", elapsed.as_millis() ); } @@ -1154,6 +1156,22 @@ impl Event { _ => true, } } + + fn kind(&self) -> &'static str { + match self { + Event::NewDaemonConnection(_) => "NewDaemonConnection", + Event::DaemonConnectError(_) => "DaemonConnectError", + Event::DaemonHeartbeat { .. } => "DaemonHeartbeat", + Event::Dataflow { .. } => "Dataflow", + Event::Control(_) => "Control", + Event::Daemon(_) => "Daemon", + Event::DaemonHeartbeatInterval => "DaemonHeartbeatInterval", + Event::CtrlC => "CtrlC", + Event::Log(_) => "Log", + Event::DaemonExit { .. } => "DaemonExit", + Event::DataflowSpawnResult { .. 
} => "DataflowSpawnResult",
+        }
+    }
 }
 
 #[derive(Debug)]
diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs
index 2046384c..a79a344c 100644
--- a/binaries/daemon/src/lib.rs
+++ b/binaries/daemon/src/lib.rs
@@ -320,8 +320,9 @@ impl Daemon {
             tracing::warn!("failed to update HLC with incoming event timestamp: {err}");
         }
 
+        // used below for checking the duration of event handling
         let start = Instant::now();
-        let event_debug = format!("{inner:?}");
+        let event_kind = inner.kind();
 
         match inner {
             Event::Coordinator(CoordinatorEvent { event, reply_tx }) => {
@@ -430,10 +431,11 @@ impl Daemon {
                 }
             }
 
+            // warn if event handling took too long -> the main loop should never be blocked for too long
             let elapsed = start.elapsed();
             if elapsed > Duration::from_millis(100) {
                 tracing::warn!(
-                    "Daemon took {}ms for handling event: {event_debug}",
+                    "Daemon took {}ms for handling event: {event_kind}",
                     elapsed.as_millis()
                 );
             }
@@ -2152,6 +2154,24 @@ impl From for Event {
     }
 }
 
+impl Event {
+    pub fn kind(&self) -> &'static str {
+        match self {
+            Event::Node { .. } => "Node",
+            Event::Coordinator(_) => "Coordinator",
+            Event::Daemon(_) => "Daemon",
+            Event::Dora(_) => "Dora",
+            Event::DynamicNode(_) => "DynamicNode",
+            Event::HeartbeatInterval => "HeartbeatInterval",
+            Event::CtrlC => "CtrlC",
+            Event::SecondCtrlC => "SecondCtrlC",
+            Event::DaemonError(_) => "DaemonError",
+            Event::SpawnNodeResult { .. } => "SpawnNodeResult",
+            Event::SpawnDataflowResult { .. } => "SpawnDataflowResult",
+        }
+    }
+}
+
 #[derive(Debug)]
 pub enum DaemonNodeEvent {
     OutputsDone {

From 3d3271ad098092da8ea977956e5a810112957270 Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Wed, 23 Apr 2025 17:57:01 +0200
Subject: [PATCH 031/101] Fix: Don't split log messages across multiple log events

---
 binaries/daemon/src/log.rs | 42 ++++++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/binaries/daemon/src/log.rs b/binaries/daemon/src/log.rs
index ef45452c..2aee36f6 100644
--- a/binaries/daemon/src/log.rs
+++ b/binaries/daemon/src/log.rs
@@ -181,27 +181,32 @@ impl Logger {
         match message.level {
             LogLevel::Error => {
                 if let Some(node_id) = message.node_id {
-                    tracing::error!("{}/{} errored:", message.dataflow_id.to_string(), node_id);
-                }
-                for line in message.message.lines() {
-                    tracing::error!("    {}", line);
+                    tracing::error!(
+                        "{}/{} errored:\n{}",
+                        message.dataflow_id.to_string(),
+                        node_id,
+                        Indent(&message.message)
+                    );
                 }
             }
             LogLevel::Warn => {
                 if let Some(node_id) = message.node_id {
-                    tracing::warn!("{}/{} warned:", message.dataflow_id.to_string(), node_id);
-                }
-                for line in message.message.lines() {
-                    tracing::warn!("    {}", line);
+                    tracing::warn!(
+                        "{}/{} warned:\n{}",
+                        message.dataflow_id.to_string(),
+                        node_id,
+                        Indent(&message.message)
+                    );
                 }
             }
             LogLevel::Info => {
                 if let Some(node_id) = message.node_id {
-                    tracing::info!("{}/{} info:", message.dataflow_id.to_string(), node_id);
-                }
-
-                for line in message.message.lines() {
-                    tracing::info!("    {}", line);
+                    tracing::info!(
+                        "{}/{} info:\n{}",
+                        message.dataflow_id.to_string(),
+                        node_id,
+                        Indent(&message.message)
+                    );
                 }
             }
             _ => {}
@@ -254,3 +259,14 @@ impl DerefMut for CowMut<'_, T> {
         }
     }
 }
+
+struct Indent<'a>(&'a str);
+
+impl std::fmt::Display for Indent<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        for line in self.0.lines() {
+            // `writeln!` keeps every indented line on its own line within the
+            // single log event (plain `write!` would run the lines together)
+            writeln!(f, "    {}", line)?;
+        }
+        Ok(())
+    }
+}

From bcb861932c1461f306268ba55c1bab943885a882 Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Wed, 23
Apr 2025 19:04:28 +0200 Subject: [PATCH 032/101] Make `dora build` command behave like `dora start` without spawning Instead of running all the build commands directly, run them on their intended target machines through the coordinator. This commit is a breaking change because a coordinator connection is now required for `dora build`. --- binaries/cli/src/build.rs | 67 --------- binaries/cli/src/lib.rs | 133 +++++++++++------- binaries/coordinator/src/lib.rs | 28 +++- binaries/coordinator/src/run/mod.rs | 17 ++- binaries/daemon/src/lib.rs | 5 + binaries/daemon/src/spawn.rs | 15 +- examples/multiple-daemons/run.rs | 1 + libraries/message/src/cli_to_coordinator.rs | 1 + .../message/src/coordinator_to_daemon.rs | 1 + 9 files changed, 145 insertions(+), 123 deletions(-) delete mode 100644 binaries/cli/src/build.rs diff --git a/binaries/cli/src/build.rs b/binaries/cli/src/build.rs deleted file mode 100644 index 7783615b..00000000 --- a/binaries/cli/src/build.rs +++ /dev/null @@ -1,67 +0,0 @@ -use dora_core::{ - build::run_build_command, - config::OperatorId, - descriptor::{Descriptor, DescriptorExt, NodeExt, SINGLE_OPERATOR_DEFAULT_ID}, -}; -use eyre::Context; - -use crate::resolve_dataflow; - -pub fn build(dataflow: String, uv: bool) -> eyre::Result<()> { - let dataflow = resolve_dataflow(dataflow).context("could not resolve dataflow")?; - let descriptor = Descriptor::blocking_read(&dataflow)?; - let dataflow_absolute = if dataflow.is_relative() { - std::env::current_dir().unwrap().join(dataflow) - } else { - dataflow.to_owned() - }; - let working_dir = dataflow_absolute.parent().unwrap(); - - let default_op_id = OperatorId::from(SINGLE_OPERATOR_DEFAULT_ID.to_string()); - - for node in descriptor.nodes { - match node.kind()? { - dora_core::descriptor::NodeKind::Standard(_) => { - if let Some(build) = &node.build { - run_build_command(build, working_dir, uv, &node.env).with_context(|| { - format!("build command failed for standard node `{}`", node.id) - })? - } - } - dora_core::descriptor::NodeKind::Runtime(runtime_node) => { - for operator in &runtime_node.operators { - if let Some(build) = &operator.config.build { - run_build_command(build, working_dir, uv, &node.env).with_context( - || { - format!( - "build command failed for operator `{}/{}`", - node.id, operator.id - ) - }, - )?; - } - } - } - dora_core::descriptor::NodeKind::Custom(custom_node) => { - if let Some(build) = &custom_node.build { - run_build_command(build, working_dir, uv, &node.env).with_context(|| { - format!("build command failed for custom node `{}`", node.id) - })? - } - } - dora_core::descriptor::NodeKind::Operator(operator) => { - if let Some(build) = &operator.config.build { - run_build_command(build, working_dir, uv, &node.env).with_context(|| { - format!( - "build command failed for operator `{}/{}`", - node.id, - operator.id.as_ref().unwrap_or(&default_op_id) - ) - })? 
- } - } - } - } - - Ok(()) -} diff --git a/binaries/cli/src/lib.rs b/binaries/cli/src/lib.rs index 1357fa78..2b667b0d 100644 --- a/binaries/cli/src/lib.rs +++ b/binaries/cli/src/lib.rs @@ -33,7 +33,6 @@ use tracing::level_filters::LevelFilter; use uuid::Uuid; mod attach; -pub(crate) mod build; mod check; mod formatting; mod graph; @@ -83,6 +82,12 @@ enum Command { /// Path to the dataflow descriptor file #[clap(value_name = "PATH")] dataflow: String, + /// Address of the dora coordinator + #[clap(long, value_name = "IP", default_value_t = LOCALHOST)] + coordinator_addr: IpAddr, + /// Port number of the coordinator control server + #[clap(long, value_name = "PORT", default_value_t = DORA_COORDINATOR_PORT_CONTROL_DEFAULT)] + coordinator_port: u16, // Use UV to build nodes. #[clap(long, action)] uv: bool, @@ -366,8 +371,13 @@ fn run(args: Args) -> eyre::Result<()> { } => { graph::create(dataflow, mermaid, open)?; } - Command::Build { dataflow, uv } => { - build::build(dataflow, uv)?; + Command::Build { + dataflow, + coordinator_addr, + coordinator_port, + uv, + } => { + start_dataflow(dataflow, None, coordinator_addr, coordinator_port, uv, true)?; } Command::New { args, @@ -419,26 +429,15 @@ fn run(args: Args) -> eyre::Result<()> { hot_reload, uv, } => { - let dataflow = resolve_dataflow(dataflow).context("could not resolve dataflow")?; - let dataflow_descriptor = - Descriptor::blocking_read(&dataflow).wrap_err("Failed to read yaml dataflow")?; - let working_dir = dataflow - .canonicalize() - .context("failed to canonicalize dataflow path")? - .parent() - .ok_or_else(|| eyre::eyre!("dataflow path has no parent dir"))? - .to_owned(); - - let coordinator_socket = (coordinator_addr, coordinator_port).into(); - let mut session = connect_to_coordinator(coordinator_socket) - .wrap_err("failed to connect to dora coordinator")?; - let dataflow_id = start_dataflow( - dataflow_descriptor.clone(), - name, - working_dir, - &mut *session, - uv, - )?; + let (dataflow, dataflow_descriptor, coordinator_socket, mut session, dataflow_id) = + start_dataflow( + dataflow, + name, + coordinator_addr, + coordinator_port, + uv, + false, + )?; let attach = match (attach, detach) { (true, true) => eyre::bail!("both `--attach` and `--detach` are given"), @@ -613,34 +612,72 @@ fn run(args: Args) -> eyre::Result<()> { } fn start_dataflow( - dataflow: Descriptor, + dataflow: String, name: Option, - local_working_dir: PathBuf, - session: &mut TcpRequestReplyConnection, + coordinator_addr: IpAddr, + coordinator_port: u16, uv: bool, -) -> Result { - let reply_raw = session - .request( - &serde_json::to_vec(&ControlRequest::Start { - dataflow, - name, - local_working_dir, - uv, - }) - .unwrap(), - ) - .wrap_err("failed to send start dataflow message")?; - - let result: ControlRequestReply = - serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; - match result { - ControlRequestReply::DataflowStarted { uuid } => { - eprintln!("{uuid}"); - Ok(uuid) + build_only: bool, +) -> Result< + ( + PathBuf, + Descriptor, + SocketAddr, + Box, + Uuid, + ), + eyre::Error, +> { + let dataflow = resolve_dataflow(dataflow).context("could not resolve dataflow")?; + let dataflow_descriptor = + Descriptor::blocking_read(&dataflow).wrap_err("Failed to read yaml dataflow")?; + let working_dir = dataflow + .canonicalize() + .context("failed to canonicalize dataflow path")? + .parent() + .ok_or_else(|| eyre::eyre!("dataflow path has no parent dir"))? 
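+        // the descriptor's parent directory becomes the dataflow's working dir;
+        // daemons resolve relative node paths and build commands against it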
+ .to_owned(); + let coordinator_socket = (coordinator_addr, coordinator_port).into(); + let mut session = connect_to_coordinator(coordinator_socket) + .wrap_err("failed to connect to dora coordinator")?; + let dataflow_id = { + let dataflow = dataflow_descriptor.clone(); + let session: &mut TcpRequestReplyConnection = &mut *session; + let reply_raw = session + .request( + &serde_json::to_vec(&ControlRequest::Start { + dataflow, + name, + local_working_dir: working_dir, + uv, + build_only, + }) + .unwrap(), + ) + .wrap_err("failed to send start dataflow message")?; + + let result: ControlRequestReply = + serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; + match result { + ControlRequestReply::DataflowStarted { uuid } => { + if build_only { + eprintln!("dataflow build successful"); + } else { + eprintln!("{uuid}"); + } + uuid + } + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), } - ControlRequestReply::Error(err) => bail!("{err}"), - other => bail!("unexpected start dataflow reply: {other:?}"), - } + }; + Ok(( + dataflow, + dataflow_descriptor, + coordinator_socket, + session, + dataflow_id, + )) } fn stop_dataflow_interactive( diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 2bb2a9fc..422043ca 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -394,6 +394,7 @@ async fn start_inner( name, local_working_dir, uv, + build_only, } => { let name = name.or_else(|| names::Generator::default().next()); @@ -414,6 +415,7 @@ async fn start_inner( &mut daemon_connections, &clock, uv, + build_only, ) .await?; Ok(dataflow) @@ -716,13 +718,23 @@ async fn start_inner( match result { Ok(()) => { if dataflow.pending_spawn_results.is_empty() { - tracing::info!("successfully spawned dataflow `{dataflow_id}`"); + tracing::info!( + "successfully {} dataflow `{dataflow_id}`", + if dataflow.build_only { + "built" + } else { + "spawned" + } + ); if let Some(reply_tx) = dataflow.spawn_result_tx.take() { let _ = reply_tx.send(Ok(ControlRequestReply::DataflowStarted { uuid: dataflow_id, })); } + if dataflow.build_only { + running_dataflows.remove(&dataflow_id); + } } } Err(err) => { @@ -846,6 +858,8 @@ struct RunningDataflow { pending_spawn_results: BTreeSet, spawn_result_tx: Option>>, + + build_only: bool, } struct ArchivedDataflow { @@ -1045,12 +1059,21 @@ async fn start_dataflow( daemon_connections: &mut DaemonConnections, clock: &HLC, uv: bool, + build_only: bool, ) -> eyre::Result { let SpawnedDataflow { uuid, daemons, nodes, - } = spawn_dataflow(dataflow, working_dir, daemon_connections, clock, uv).await?; + } = spawn_dataflow( + dataflow, + working_dir, + daemon_connections, + clock, + uv, + build_only, + ) + .await?; Ok(RunningDataflow { uuid, name, @@ -1066,6 +1089,7 @@ async fn start_dataflow( log_subscribers: Vec::new(), pending_spawn_results: daemons, spawn_result_tx: None, + build_only, }) } diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index 09c2b1a4..425f0213 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -26,6 +26,7 @@ pub(super) async fn spawn_dataflow( daemon_connections: &mut DaemonConnections, clock: &HLC, uv: bool, + build_only: bool, ) -> eyre::Result { let nodes = dataflow.resolve_aliases_and_set_defaults()?; let uuid = Uuid::new_v7(Timestamp::now(NoContext)); @@ -36,7 +37,8 @@ pub(super) async fn spawn_dataflow( for (machine, nodes_on_machine) in 
&nodes_by_daemon { let spawn_nodes = nodes_on_machine.iter().map(|n| n.id.clone()).collect(); tracing::debug!( - "Spawning dataflow `{uuid}` on machine `{machine:?}` (nodes: {spawn_nodes:?})" + "{} dataflow `{uuid}` on machine `{machine:?}` (nodes: {spawn_nodes:?})", + if build_only { "Building" } else { "Spawning" } ); let spawn_command = SpawnDataflowNodes { @@ -46,6 +48,7 @@ pub(super) async fn spawn_dataflow( dataflow_descriptor: dataflow.clone(), spawn_nodes, uv, + build_only, }; let message = serde_json::to_vec(&Timestamped { inner: DaemonCoordinatorEvent::Spawn(spawn_command), @@ -54,11 +57,19 @@ pub(super) async fn spawn_dataflow( let daemon_id = spawn_dataflow_on_machine(daemon_connections, machine.as_deref(), &message) .await - .wrap_err_with(|| format!("failed to spawn dataflow on machine `{machine:?}`"))?; + .wrap_err_with(|| { + format!( + "failed to {} dataflow on machine `{machine:?}`", + if build_only { "build" } else { "spawn" } + ) + })?; daemons.insert(daemon_id); } - tracing::info!("successfully triggered dataflow spawn `{uuid}`"); + tracing::info!( + "successfully triggered dataflow {} `{uuid}`", + if build_only { "build" } else { "spawn" } + ); Ok(SpawnedDataflow { uuid, diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index a79a344c..cc618d5f 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -167,6 +167,7 @@ impl Daemon { nodes, dataflow_descriptor: descriptor, uv, + build_only: false, }; let clock = Arc::new(HLC::default()); @@ -470,6 +471,7 @@ impl Daemon { dataflow_descriptor, spawn_nodes, uv, + build_only, }) => { match dataflow_descriptor.communication.remote { dora_core::config::RemoteCommunicationConfig::Tcp => {} @@ -490,6 +492,7 @@ impl Daemon { dataflow_descriptor, spawn_nodes, uv, + build_only, ) .await; let (trigger_result, result_task) = match result { @@ -762,6 +765,7 @@ impl Daemon { dataflow_descriptor: Descriptor, spawn_nodes: BTreeSet, uv: bool, + build_only: bool, ) -> eyre::Result>> { let mut logger = self.logger.for_dataflow(dataflow_id); let dataflow = @@ -821,6 +825,7 @@ impl Daemon { dataflow_descriptor, clock: self.clock.clone(), uv, + build_only, }; let mut tasks = Vec::new(); diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 3927eb3c..b066975d 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -56,6 +56,7 @@ pub struct Spawner { /// clock is required for generating timestamps when dropping messages early because queue is full pub clock: Arc, pub uv: bool, + pub build_only: bool, } impl Spawner { @@ -148,8 +149,12 @@ impl Spawner { self.build_node(logger, &node.env, self.working_dir.clone(), build) .await?; } - spawn_command_from_path(&self.working_dir, self.uv, logger, &n, true) - .await? + if self.build_only { + None + } else { + spawn_command_from_path(&self.working_dir, self.uv, logger, &n, true) + .await? 
+ } } dora_message::descriptor::NodeSource::GitBranch { repo, rev } => { self.spawn_git_node(&n, repo, rev, logger, &node.env, prepared_git.unwrap()) @@ -680,7 +685,11 @@ impl Spawner { self.build_node(logger, node_env, clone_dir.clone(), build) .await?; } - spawn_command_from_path(&clone_dir, self.uv, logger, node, true).await + if self.build_only { + Ok(None) + } else { + spawn_command_from_path(&clone_dir, self.uv, logger, node, true).await + } } async fn build_node( diff --git a/examples/multiple-daemons/run.rs b/examples/multiple-daemons/run.rs index 64410a8a..17b4765f 100644 --- a/examples/multiple-daemons/run.rs +++ b/examples/multiple-daemons/run.rs @@ -143,6 +143,7 @@ async fn start_dataflow( local_working_dir: working_dir, name: None, uv: false, + build_only: false, }, reply_sender, })) diff --git a/libraries/message/src/cli_to_coordinator.rs b/libraries/message/src/cli_to_coordinator.rs index 1b62fd58..ab91f449 100644 --- a/libraries/message/src/cli_to_coordinator.rs +++ b/libraries/message/src/cli_to_coordinator.rs @@ -16,6 +16,7 @@ pub enum ControlRequest { // binaries from CLI to coordinator/daemon local_working_dir: PathBuf, uv: bool, + build_only: bool, }, Reload { dataflow_id: Uuid, diff --git a/libraries/message/src/coordinator_to_daemon.rs b/libraries/message/src/coordinator_to_daemon.rs index 482f0042..8c68a6ca 100644 --- a/libraries/message/src/coordinator_to_daemon.rs +++ b/libraries/message/src/coordinator_to_daemon.rs @@ -63,4 +63,5 @@ pub struct SpawnDataflowNodes { pub dataflow_descriptor: Descriptor, pub spawn_nodes: BTreeSet, pub uv: bool, + pub build_only: bool, } From d92d51f6713a4c9f47dc370a6f1a085bad4c2c2a Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 25 Apr 2025 11:33:35 +0200 Subject: [PATCH 033/101] CI: Remove some `dora build` calls that were run before `dora up` The `dora build` command now requires a connection to the coordinator. 
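
For illustration, this is roughly the request that `dora build` now sends to
the coordinator, mirroring the `ControlRequest::Start` usage that the previous
patch added to `binaries/cli/src/lib.rs` (a sketch; variable names and error
handling are illustrative, not authoritative):

    // build-only variant of the start request, sent over the existing
    // coordinator control connection (`session`)
    let request = ControlRequest::Start {
        dataflow: dataflow_descriptor.clone(),
        name: None,
        local_working_dir: working_dir,
        uv: false,
        build_only: true,
    };
    let reply_raw = session
        .request(&serde_json::to_vec(&request).unwrap())
        .wrap_err("failed to send start dataflow message")?;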
--- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 18f8f008..cb96f93f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -344,7 +344,6 @@ jobs: cd test_python_project uv venv --seed -p 3.11 uv pip install -e ../apis/python/node - dora build dataflow.yml --uv uv pip install ruff pytest # Check Compliancy @@ -407,7 +406,6 @@ jobs: # Run Rust queue latency test echo "Running CI Queue Size Latest Data Rust Test" - dora build tests/queue_size_latest_data_rust/dataflow.yaml --uv dora run tests/queue_size_latest_data_rust/dataflow.yaml --uv - name: "Test CLI (C)" From ead79013c81370463dbc596cd85da8978850ceb3 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 25 Apr 2025 12:02:42 +0200 Subject: [PATCH 034/101] Skip path checks for nodes with build command --- libraries/core/src/descriptor/validate.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libraries/core/src/descriptor/validate.rs b/libraries/core/src/descriptor/validate.rs index 526fedff..871cfb99 100644 --- a/libraries/core/src/descriptor/validate.rs +++ b/libraries/core/src/descriptor/validate.rs @@ -43,6 +43,8 @@ pub fn check_dataflow( info!("skipping path check for remote node `{}`", node.id); } } + } else if custom.build.is_some() { + info!("skipping path check for node with build command"); } else { resolve_path(source, working_dir).wrap_err_with(|| { format!("Could not find source path `{}`", source) @@ -51,7 +53,7 @@ pub fn check_dataflow( } }, dora_message::descriptor::NodeSource::GitBranch { repo, rev } => { - // TODO: implement git repo check + info!("skipping check for node with git source"); } }, descriptor::CoreNodeKind::Runtime(node) => { From 0fabf16ff0e2a00ea66d5e128e443a326a716e67 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 25 Apr 2025 12:37:05 +0200 Subject: [PATCH 035/101] Remove superfluous extra build step in example `run.rs` scripts --- examples/benchmark/run.rs | 14 -------------- examples/multiple-daemons/run.rs | 13 ------------- examples/rust-dataflow-git/run.rs | 14 -------------- examples/rust-dataflow-url/run.rs | 13 ------------- examples/rust-dataflow/run.rs | 14 -------------- examples/rust-ros2-dataflow/run.rs | 13 ------------- 6 files changed, 81 deletions(-) diff --git a/examples/benchmark/run.rs b/examples/benchmark/run.rs index b6bed6fe..8e0076bc 100644 --- a/examples/benchmark/run.rs +++ b/examples/benchmark/run.rs @@ -11,26 +11,12 @@ async fn main() -> eyre::Result<()> { .wrap_err("failed to set working dir")?; let dataflow = Path::new("dataflow.yml"); - build_dataflow(dataflow).await?; run_dataflow(dataflow).await?; Ok(()) } -async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { - let cargo = std::env::var("CARGO").unwrap(); - let mut cmd = tokio::process::Command::new(&cargo); - cmd.arg("run"); - cmd.arg("--package").arg("dora-cli"); - cmd.arg("--release"); - cmd.arg("--").arg("build").arg(dataflow); - if !cmd.status().await?.success() { - bail!("failed to build dataflow"); - }; - Ok(()) -} - async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let cargo = std::env::var("CARGO").unwrap(); let mut cmd = tokio::process::Command::new(&cargo); diff --git a/examples/multiple-daemons/run.rs b/examples/multiple-daemons/run.rs index 17b4765f..e0b59486 100644 --- a/examples/multiple-daemons/run.rs +++ b/examples/multiple-daemons/run.rs @@ -36,7 +36,6 @@ async fn main() -> eyre::Result<()> { .wrap_err("failed to set working dir")?; let 
dataflow = Path::new("dataflow.yml"); - build_dataflow(dataflow).await?; let (coordinator_events_tx, coordinator_events_rx) = mpsc::channel(1); let coordinator_bind = SocketAddr::new( @@ -211,18 +210,6 @@ async fn destroy(coordinator_events_tx: &Sender) -> eyre::Result<()> { } } -async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { - let cargo = std::env::var("CARGO").unwrap(); - let mut cmd = tokio::process::Command::new(&cargo); - cmd.arg("run"); - cmd.arg("--package").arg("dora-cli"); - cmd.arg("--").arg("build").arg(dataflow); - if !cmd.status().await?.success() { - bail!("failed to build dataflow"); - }; - Ok(()) -} - async fn run_daemon(coordinator: String, machine_id: &str) -> eyre::Result<()> { let cargo = std::env::var("CARGO").unwrap(); let mut cmd = tokio::process::Command::new(&cargo); diff --git a/examples/rust-dataflow-git/run.rs b/examples/rust-dataflow-git/run.rs index 213b65a0..6a6a8782 100644 --- a/examples/rust-dataflow-git/run.rs +++ b/examples/rust-dataflow-git/run.rs @@ -17,25 +17,11 @@ async fn main() -> eyre::Result<()> { Path::new("dataflow.yml") }; - build_dataflow(dataflow).await?; - run_dataflow(dataflow).await?; Ok(()) } -async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { - let cargo = std::env::var("CARGO").unwrap(); - let mut cmd = tokio::process::Command::new(&cargo); - cmd.arg("run"); - cmd.arg("--package").arg("dora-cli"); - cmd.arg("--").arg("build").arg(dataflow); - if !cmd.status().await?.success() { - bail!("failed to build dataflow"); - }; - Ok(()) -} - async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let cargo = std::env::var("CARGO").unwrap(); let mut cmd = tokio::process::Command::new(&cargo); diff --git a/examples/rust-dataflow-url/run.rs b/examples/rust-dataflow-url/run.rs index 6f511970..e93a5d28 100644 --- a/examples/rust-dataflow-url/run.rs +++ b/examples/rust-dataflow-url/run.rs @@ -11,25 +11,12 @@ async fn main() -> eyre::Result<()> { .wrap_err("failed to set working dir")?; let dataflow = Path::new("dataflow.yml"); - build_dataflow(dataflow).await?; run_dataflow(dataflow).await?; Ok(()) } -async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { - let cargo = std::env::var("CARGO").unwrap(); - let mut cmd = tokio::process::Command::new(&cargo); - cmd.arg("run"); - cmd.arg("--package").arg("dora-cli"); - cmd.arg("--").arg("build").arg(dataflow); - if !cmd.status().await?.success() { - bail!("failed to build dataflow"); - }; - Ok(()) -} - async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let cargo = std::env::var("CARGO").unwrap(); let mut cmd = tokio::process::Command::new(&cargo); diff --git a/examples/rust-dataflow/run.rs b/examples/rust-dataflow/run.rs index 213b65a0..6a6a8782 100644 --- a/examples/rust-dataflow/run.rs +++ b/examples/rust-dataflow/run.rs @@ -17,25 +17,11 @@ async fn main() -> eyre::Result<()> { Path::new("dataflow.yml") }; - build_dataflow(dataflow).await?; - run_dataflow(dataflow).await?; Ok(()) } -async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { - let cargo = std::env::var("CARGO").unwrap(); - let mut cmd = tokio::process::Command::new(&cargo); - cmd.arg("run"); - cmd.arg("--package").arg("dora-cli"); - cmd.arg("--").arg("build").arg(dataflow); - if !cmd.status().await?.success() { - bail!("failed to build dataflow"); - }; - Ok(()) -} - async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let cargo = std::env::var("CARGO").unwrap(); let mut cmd = tokio::process::Command::new(&cargo); diff --git a/examples/rust-ros2-dataflow/run.rs 
b/examples/rust-ros2-dataflow/run.rs
index a14dce48..b930a91b 100644
--- a/examples/rust-ros2-dataflow/run.rs
+++ b/examples/rust-ros2-dataflow/run.rs
@@ -11,25 +11,12 @@ async fn main() -> eyre::Result<()> {
         .wrap_err("failed to set working dir")?;
 
     let dataflow = Path::new("dataflow.yml");
-    build_dataflow(dataflow).await?;
 
     run_dataflow(dataflow).await?;
 
     Ok(())
 }
 
-async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> {
-    let cargo = std::env::var("CARGO").unwrap();
-    let mut cmd = tokio::process::Command::new(&cargo);
-    cmd.arg("run");
-    cmd.arg("--package").arg("dora-cli");
-    cmd.arg("--").arg("build").arg(dataflow);
-    if !cmd.status().await?.success() {
-        bail!("failed to build dataflow");
-    };
-    Ok(())
-}
-
 async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> {
     let cargo = std::env::var("CARGO").unwrap();
     let mut cmd = tokio::process::Command::new(&cargo);

From 3b2ccec5ec7287e9b156925f44c5783b0c13dedc Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Fri, 25 Apr 2025 12:56:37 +0200
Subject: [PATCH 036/101] Fix error messages (actually include node ID instead of printing `'{node_id}'`)

---
 binaries/daemon/src/lib.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs
index cc618d5f..f40c3d15 100644
--- a/binaries/daemon/src/lib.rs
+++ b/binaries/daemon/src/lib.rs
@@ -1002,7 +1002,7 @@ impl Daemon {
 
                 let node_config = match number_node_id {
                     2.. => Err(format!(
-                        "multiple dataflows contains dynamic node id {node_id}. \
+                        "multiple dataflows contain dynamic node id {node_id}. \
                         Please only have one running dataflow with the specified \
                         node id if you want to use dynamic node",
                     )),
@@ -1014,7 +1014,9 @@ impl Daemon {
                         let node_config = dataflow
                             .running_nodes
                             .get(&node_id)
-                            .context("no node with ID `{node_id}` within the given dataflow")?
+                            .with_context(|| {
+                                format!("no node with ID `{node_id}` within the given dataflow")
+                            })?
.node_config .clone(); if !node_config.dynamic { @@ -1030,7 +1032,7 @@ impl Daemon { "failed to get dynamic node config within given dataflow: {err}" ) }), - 0 => Err("no node with ID `{node_id}`".to_string()), + 0 => Err(format!("no node with ID `{node_id}`")), }; let reply = DaemonReply::NodeConfig { From a13a297758580c0331111da9a652718d2db80fa5 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 25 Apr 2025 12:57:03 +0200 Subject: [PATCH 037/101] Fix: Remove dataflow from daemon running list if it was `build_only` --- binaries/daemon/src/lib.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index f40c3d15..582a018c 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -413,7 +413,11 @@ impl Daemon { Event::SpawnDataflowResult { dataflow_id, result, + build_only, } => { + if build_only { + self.running.remove(&dataflow_id); + } if let Some(connection) = &mut self.coordinator_connection { let msg = serde_json::to_vec(&Timestamped { inner: CoordinatorRequest::Event { @@ -512,6 +516,7 @@ impl Daemon { inner: Event::SpawnDataflowResult { dataflow_id, result: result_task.await, + build_only, }, timestamp: clock.new_timestamp(), }; @@ -2152,6 +2157,7 @@ pub enum Event { SpawnDataflowResult { dataflow_id: Uuid, result: eyre::Result<()>, + build_only: bool, }, } From d9d23a4a76e963be7911b9ed4e6f633b22b75bbc Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 25 Apr 2025 13:52:25 +0200 Subject: [PATCH 038/101] Improve log printing for multiple daemons example --- examples/multiple-daemons/run.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/multiple-daemons/run.rs b/examples/multiple-daemons/run.rs index e0b59486..a042253b 100644 --- a/examples/multiple-daemons/run.rs +++ b/examples/multiple-daemons/run.rs @@ -8,7 +8,7 @@ use dora_message::{ common::DaemonId, coordinator_to_cli::{ControlRequestReply, DataflowIdAndName}, }; -use dora_tracing::set_up_tracing; +use dora_tracing::set_up_tracing_opts; use eyre::{bail, Context}; use std::{ @@ -29,7 +29,8 @@ use uuid::Uuid; #[tokio::main] async fn main() -> eyre::Result<()> { - set_up_tracing("multiple-daemon-runner").wrap_err("failed to set up tracing subscriber")?; + set_up_tracing_opts("multiple-daemon-runner", Some("debug"), None) + .wrap_err("failed to set up tracing subscriber")?; let root = Path::new(env!("CARGO_MANIFEST_DIR")); std::env::set_current_dir(root.join(file!()).parent().unwrap()) @@ -46,12 +47,15 @@ async fn main() -> eyre::Result<()> { IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), DORA_COORDINATOR_PORT_CONTROL_DEFAULT, ); - let (_coordinator_port, coordinator) = dora_coordinator::start( + let (coordinator_port, coordinator) = dora_coordinator::start( coordinator_bind, coordinator_control_bind, ReceiverStream::new(coordinator_events_rx), ) .await?; + + tracing::info!("coordinator running on {coordinator_port}"); + let coordinator_addr = Ipv4Addr::LOCALHOST; let daemon_a = run_daemon(coordinator_addr.to_string(), "A"); let daemon_b = run_daemon(coordinator_addr.to_string(), "B"); From 3b00466e3bfdc0a08af906f888ff59721fd5f862 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 25 Apr 2025 13:52:37 +0200 Subject: [PATCH 039/101] Skip path check for operators with build commands --- libraries/core/src/descriptor/validate.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libraries/core/src/descriptor/validate.rs b/libraries/core/src/descriptor/validate.rs index 871cfb99..d0c1d79a 100644 --- 
a/libraries/core/src/descriptor/validate.rs +++ b/libraries/core/src/descriptor/validate.rs @@ -62,6 +62,8 @@ pub fn check_dataflow( OperatorSource::SharedLibrary(path) => { if source_is_url(path) { info!("{path} is a URL."); // TODO: Implement url check. + } else if operator_definition.config.build.is_some() { + info!("skipping path check for operator with build command"); } else { let path = adjust_shared_library_path(Path::new(&path))?; if !working_dir.join(&path).exists() { From c82e6e450e97d61c2f540315bab8ab15eeed54b4 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 25 Apr 2025 13:55:08 +0200 Subject: [PATCH 040/101] Remove build step for python-ros2-dataflow example The build step now requires a coordinator connection. --- examples/python-ros2-dataflow/run.rs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/examples/python-ros2-dataflow/run.rs b/examples/python-ros2-dataflow/run.rs index 23b254e2..2873426e 100644 --- a/examples/python-ros2-dataflow/run.rs +++ b/examples/python-ros2-dataflow/run.rs @@ -40,15 +40,6 @@ async fn main() -> eyre::Result<()> { async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let cargo = std::env::var("CARGO").unwrap(); - // First build the dataflow (install requirements) - let mut cmd = tokio::process::Command::new(&cargo); - cmd.arg("run"); - cmd.arg("--package").arg("dora-cli"); - cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); - if !cmd.status().await?.success() { - bail!("failed to run dataflow"); - }; - let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); From 2279258a6e38d514f687b55651030a6aa9f9b75c Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 25 Apr 2025 13:57:25 +0200 Subject: [PATCH 041/101] Fix: Run `dora build` before running pytest --- .github/workflows/ci.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cb96f93f..8c8959f9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -346,13 +346,16 @@ jobs: uv pip install -e ../apis/python/node uv pip install ruff pytest + echo "Running dora up" + dora up + echo "Running dora build" + dora build dataflow.yml --uv + # Check Compliancy uv run ruff check . 
uv run pytest export OPERATING_MODE=SAVE - echo "Running dora up" - dora up echo "Running dora list" dora list dora build dataflow.yml --uv From 80c7231fb983826284ad0c6112eb1614847fd423 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 25 Apr 2025 16:28:44 +0200 Subject: [PATCH 042/101] Tweak logging --- binaries/daemon/src/log.rs | 66 ++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/binaries/daemon/src/log.rs b/binaries/daemon/src/log.rs index 2aee36f6..26488361 100644 --- a/binaries/daemon/src/log.rs +++ b/binaries/daemon/src/log.rs @@ -180,34 +180,52 @@ impl Logger { // log message using tracing if reporting to coordinator is not possible match message.level { LogLevel::Error => { - if let Some(node_id) = message.node_id { - tracing::error!( - "{}/{} errored:\n{}", - message.dataflow_id.to_string(), - node_id, - Indent(&message.message) - ); - } + tracing::error!( + dataflow_id = message.dataflow_id.to_string(), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); } LogLevel::Warn => { - if let Some(node_id) = message.node_id { - tracing::warn!( - "{}/{} warned:\n{}", - message.dataflow_id.to_string(), - node_id, - Indent(&message.message) - ); - } + tracing::warn!( + dataflow_id = message.dataflow_id.to_string(), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); } LogLevel::Info => { - if let Some(node_id) = message.node_id { - tracing::info!( - "{}/{} info:\n{}", - message.dataflow_id.to_string(), - node_id, - Indent(&message.message) - ); - } + tracing::info!( + dataflow_id = message.dataflow_id.to_string(), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + LogLevel::Debug => { + tracing::debug!( + dataflow_id = message.dataflow_id.to_string(), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); } _ => {} } From 17f75c04d588e5888f840c455eecaffb1df257d8 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 25 Apr 2025 16:29:07 +0200 Subject: [PATCH 043/101] Wait until all nodes on daemon have been built before spawning --- binaries/daemon/src/lib.rs | 161 +++++++--- binaries/daemon/src/spawn.rs | 599 +++++++++++++++++++---------------- 2 files changed, 452 insertions(+), 308 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 582a018c..7abe7ec5 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -772,7 +772,12 @@ impl Daemon { uv: bool, build_only: bool, ) -> eyre::Result>> { - let mut logger = self.logger.for_dataflow(dataflow_id); + let mut logger = self + .logger + .for_dataflow(dataflow_id) + .try_clone() + .await + .context("failed to clone logger")?; let dataflow = RunningDataflow::new(dataflow_id, self.daemon_id.clone(), &dataflow_descriptor); let dataflow = match self.running.entry(dataflow_id) { @@ -857,7 +862,7 @@ impl Daemon { .await; match spawner .clone() - .spawn_node( + .prepare_node( node, node_stderr_most_recent, &mut logger, @@ 
-867,39 +872,7 @@ impl Daemon { .wrap_err_with(|| format!("failed to spawn node `{node_id}`")) { Ok(result) => { - let events_tx = self.events_tx.clone(); - let clock = self.clock.clone(); - tasks.push(async move { - let (node_spawn_result, success) = match result.await { - Ok(node) => (Ok(node), Ok(())), - Err(err) => { - let node_err = NodeError { - timestamp: clock.new_timestamp(), - cause: NodeErrorCause::Other { - stderr: format!("spawn failed: {err:?}"), - }, - exit_status: NodeExitStatus::Unknown, - }; - (Err(node_err), Err(err)) - } - }; - let send_result = events_tx - .send(Timestamped { - inner: Event::SpawnNodeResult { - dataflow_id, - node_id, - result: node_spawn_result, - }, - timestamp: clock.new_timestamp(), - }) - .await; - if send_result.is_err() { - tracing::error!( - "failed to send SpawnNodeResult to main daemon task" - ) - } - success - }); + tasks.push((node_id, result)); } Err(err) => { logger @@ -980,16 +953,122 @@ impl Daemon { self.handle_node_stop(dataflow_id, &node_id).await?; } - let spawn_result = async move { - for task in tasks { - task.await?; - } - Ok(()) - }; + let spawn_result = Self::spawn_prepared_nodes( + dataflow_id, + logger, + tasks, + self.events_tx.clone(), + self.clock.clone(), + ); Ok(spawn_result) } + async fn spawn_prepared_nodes( + dataflow_id: Uuid, + mut logger: DataflowLogger<'_>, + tasks: Vec<( + NodeId, + impl Future>, + )>, + events_tx: mpsc::Sender>, + clock: Arc, + ) -> eyre::Result<()> { + let node_result = |node_id, result| Timestamped { + inner: Event::SpawnNodeResult { + dataflow_id, + node_id, + result, + }, + timestamp: clock.new_timestamp(), + }; + let mut failed_to_prepare = None; + let mut prepared_nodes = Vec::new(); + for (node_id, task) in tasks { + match task.await { + Ok(node) => prepared_nodes.push(node), + Err(err) => { + if failed_to_prepare.is_none() { + failed_to_prepare = Some(node_id.clone()); + } + let node_err: NodeError = NodeError { + timestamp: clock.new_timestamp(), + cause: NodeErrorCause::Other { + stderr: format!("preparing for spawn failed: {err:?}"), + }, + exit_status: NodeExitStatus::Unknown, + }; + let send_result = events_tx.send(node_result(node_id, Err(node_err))).await; + if send_result.is_err() { + tracing::error!("failed to send SpawnNodeResult to main daemon task") + } + } + } + } + + // once all nodes are prepared, do the actual spawning + if let Some(failed_node) = failed_to_prepare { + // don't spawn any nodes when an error occurred before + for node in prepared_nodes { + let err = NodeError { + timestamp: clock.new_timestamp(), + cause: NodeErrorCause::Cascading { + caused_by_node: failed_node.clone(), + }, + exit_status: NodeExitStatus::Unknown, + }; + let send_result = events_tx + .send(node_result(node.node_id().clone(), Err(err))) + .await; + if send_result.is_err() { + tracing::error!("failed to send SpawnNodeResult to main daemon task") + } + } + Err(eyre!("failed to prepare node {failed_node}")) + } else { + let mut spawn_result = Ok(()); + + logger + .log( + LogLevel::Info, + None, + Some("dora daemon".into()), + "finished building nodes, spawning...", + ) + .await; + + // spawn the nodes + for node in prepared_nodes { + let node_id = node.node_id().clone(); + let mut logger = logger.reborrow().for_node(node_id.clone()); + let result = node.spawn(&mut logger).await; + let node_spawn_result = match result { + Ok(node) => Ok(node), + Err(err) => { + let node_err = NodeError { + timestamp: clock.new_timestamp(), + cause: NodeErrorCause::Other { + stderr: format!("spawn failed: 
{err:?}"), + }, + exit_status: NodeExitStatus::Unknown, + }; + if spawn_result.is_ok() { + spawn_result = Err(err.wrap_err(format!("failed to spawn {node_id}"))); + } + Err(node_err) + } + }; + let send_result = events_tx + .send(node_result(node_id, node_spawn_result)) + .await; + if send_result.is_err() { + tracing::error!("failed to send SpawnNodeResult to main daemon task") + } + } + spawn_result + } + } + async fn handle_dynamic_node_event( &mut self, event: DynamicNodeEventWrapper, diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index b066975d..f47233d8 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -22,6 +22,7 @@ use dora_message::{ daemon_to_coordinator::{DataMessage, NodeExitStatus, Timestamped}, daemon_to_node::{NodeConfig, RuntimeConfig}, descriptor::{EnvValue, GitRepoRev}, + id::NodeId, DataflowId, }; use dora_node_api::{ @@ -60,13 +61,13 @@ pub struct Spawner { } impl Spawner { - pub async fn spawn_node( + pub async fn prepare_node( mut self, node: ResolvedNode, node_stderr_most_recent: Arc>, logger: &mut NodeLogger<'_>, repos_in_use: &mut BTreeMap>, - ) -> eyre::Result>> { + ) -> eyre::Result>> { let dataflow_id = self.dataflow_id; let node_id = node.id.clone(); logger @@ -115,7 +116,7 @@ impl Spawner { .await .wrap_err("failed to clone logger")?; let task = async move { - self.spawn_node_inner( + self.prepare_node_inner( node, &mut logger, dataflow_id, @@ -128,22 +129,18 @@ impl Spawner { Ok(task) } - async fn spawn_node_inner( - &mut self, + async fn prepare_node_inner( + mut self, node: ResolvedNode, logger: &mut NodeLogger<'_>, dataflow_id: uuid::Uuid, node_config: NodeConfig, prepared_git: Option, node_stderr_most_recent: Arc>, - ) -> Result { - let send_stdout_to = node - .send_stdout_as() - .context("Could not resolve `send_stdout_as` configuration")?; - - let mut child = match node.kind { + ) -> eyre::Result { + let (command, error_msg) = match &node.kind { dora_core::descriptor::CoreNodeKind::Custom(n) => { - let command = match &n.source { + let mut command = match &n.source { dora_message::descriptor::NodeSource::Local => { if let Some(build) = &n.build { self.build_node(logger, &node.env, self.working_dir.clone(), build) @@ -152,63 +149,71 @@ impl Spawner { if self.build_only { None } else { - spawn_command_from_path(&self.working_dir, self.uv, logger, &n, true) - .await? + path_spawn_command(&self.working_dir, self.uv, logger, &n, true).await? } } dora_message::descriptor::NodeSource::GitBranch { repo, rev } => { - self.spawn_git_node(&n, repo, rev, logger, &node.env, prepared_git.unwrap()) - .await? + self.git_node_spawn_command( + &n, + repo, + rev, + logger, + &node.env, + prepared_git.unwrap(), + ) + .await? } }; - let Some(mut command) = command else { - return Ok(RunningNode { - pid: None, - node_config, - }); - }; + if let Some(command) = &mut command { + command.current_dir(&self.working_dir); + command.stdin(Stdio::null()); + + command.env( + "DORA_NODE_CONFIG", + serde_yaml::to_string(&node_config.clone()) + .wrap_err("failed to serialize node config")?, + ); + // Injecting the env variable defined in the `yaml` into + // the node runtime. 
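+                        // dataflow-level `env` entries are applied first; the node's
+                        // own `envs` entries below are applied afterwards, so they
+                        // override duplicate keys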
+ if let Some(envs) = &node.env { + for (key, value) in envs { + command.env(key, value.to_string()); + } + } + if let Some(envs) = &n.envs { + // node has some inner env variables -> add them too + for (key, value) in envs { + command.env(key, value.to_string()); + } + } - command.current_dir(&self.working_dir); - command.stdin(Stdio::null()); + // Set the process group to 0 to ensure that the spawned process does not exit immediately on CTRL-C + #[cfg(unix)] + command.process_group(0); - command.env( - "DORA_NODE_CONFIG", - serde_yaml::to_string(&node_config.clone()) - .wrap_err("failed to serialize node config")?, + command.env("PYTHONUNBUFFERED", "1"); + command + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + }; + + let error_msg = format!( + "failed to run `{}` with args `{}`", + n.path, + n.args.as_deref().unwrap_or_default(), ); - // Injecting the env variable defined in the `yaml` into - // the node runtime. - if let Some(envs) = node.env { - for (key, value) in envs { - command.env(key, value.to_string()); - } - } - if let Some(envs) = n.envs { - // node has some inner env variables -> add them too - for (key, value) in envs { - command.env(key, value.to_string()); + (command, error_msg) + } + dora_core::descriptor::CoreNodeKind::Runtime(n) => { + // run build commands + for operator in &n.operators { + if let Some(build) = &operator.config.build { + self.build_node(logger, &node.env, self.working_dir.clone(), build) + .await?; } } - // Set the process group to 0 to ensure that the spawned process does not exit immediately on CTRL-C - #[cfg(unix)] - command.process_group(0); - - command.env("PYTHONUNBUFFERED", "1"); - command - .stdin(Stdio::null()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn() - .wrap_err_with(move || { - format!( - "failed to run `{}` with args `{}`", - n.path, - n.args.as_deref().unwrap_or_default(), - ) - })? - } - dora_core::descriptor::CoreNodeKind::Runtime(n) => { let python_operators: Vec<&OperatorDefinition> = n .operators .iter() @@ -220,7 +225,9 @@ impl Spawner { .iter() .any(|x| !matches!(x.config.source, OperatorSource::Python { .. 
})); - let mut command = if !python_operators.is_empty() && !other_operators { + let mut command = if self.build_only { + None + } else if !python_operators.is_empty() && !other_operators { // Use python to spawn runtime if there is a python operator // TODO: Handle multi-operator runtime once sub-interpreter is supported @@ -253,7 +260,7 @@ impl Spawner { "-c", format!("import dora; dora.start_runtime() # {}", node.id).as_str(), ]); - command + Some(command) } else { let mut cmd = if self.uv { let mut cmd = tokio::process::Command::new("uv"); @@ -279,7 +286,7 @@ impl Spawner { "-c", format!("import dora; dora.start_runtime() # {}", node.id).as_str(), ]); - cmd + Some(cmd) } } else if python_operators.is_empty() && other_operators { let mut cmd = tokio::process::Command::new( @@ -287,41 +294,256 @@ impl Spawner { .wrap_err("failed to get current executable path")?, ); cmd.arg("runtime"); - cmd + Some(cmd) } else { eyre::bail!("Runtime can not mix Python Operator with other type of operator."); }; - command.current_dir(&self.working_dir); let runtime_config = RuntimeConfig { node: node_config.clone(), - operators: n.operators, + operators: n.operators.clone(), }; - command.env( - "DORA_RUNTIME_CONFIG", - serde_yaml::to_string(&runtime_config) - .wrap_err("failed to serialize runtime config")?, + + if let Some(command) = &mut command { + command.current_dir(&self.working_dir); + + command.env( + "DORA_RUNTIME_CONFIG", + serde_yaml::to_string(&runtime_config) + .wrap_err("failed to serialize runtime config")?, + ); + // Injecting the env variable defined in the `yaml` into + // the node runtime. + if let Some(envs) = &node.env { + for (key, value) in envs { + command.env(key, value.to_string()); + } + } + // Set the process group to 0 to ensure that the spawned process does not exit immediately on CTRL-C + #[cfg(unix)] + command.process_group(0); + + command + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + }; + let error_msg = format!( + "failed to run runtime {}/{}", + runtime_config.node.dataflow_id, runtime_config.node.node_id ); - // Injecting the env variable defined in the `yaml` into - // the node runtime. 
- if let Some(envs) = node.env { - for (key, value) in envs { - command.env(key, value.to_string()); + (command, error_msg) + } + }; + Ok(PreparedNode { + command, + spawn_error_msg: error_msg, + working_dir: self.working_dir, + dataflow_id, + node, + node_config, + clock: self.clock, + daemon_tx: self.daemon_tx, + node_stderr_most_recent, + }) + } + + async fn prepare_git_node( + &mut self, + repo_addr: &String, + rev: &Option, + repos_in_use: &mut BTreeMap>, + ) -> eyre::Result { + let dataflow_id = self.dataflow_id; + let repo_url = Url::parse(repo_addr).context("failed to parse git repository URL")?; + let target_dir = self.working_dir.join("build"); + + let clone_dir_base = { + let base = { + let mut path = + target_dir.join(repo_url.host_str().context("git URL has no hostname")?); + + path.extend(repo_url.path_segments().context("no path in git URL")?); + path + }; + match rev { + None => base, + Some(rev) => match rev { + GitRepoRev::Branch(branch) => base.join("branch").join(branch), + GitRepoRev::Tag(tag) => base.join("tag").join(tag), + GitRepoRev::Rev(rev) => base.join("rev").join(rev), + }, + } + }; + let clone_dir = if clone_dir_exists(&clone_dir_base, repos_in_use) { + let used_by_other_dataflow = + self.used_by_other_dataflow(dataflow_id, &clone_dir_base, repos_in_use); + if used_by_other_dataflow { + // don't reuse, choose new directory + // (TODO reuse if still up to date) + + let dir_name = clone_dir_base.file_name().unwrap().to_str().unwrap(); + let mut i = 1; + loop { + let new_path = clone_dir_base.with_file_name(format!("{dir_name}-{i}")); + if clone_dir_exists(&new_path, repos_in_use) + && self.used_by_other_dataflow(dataflow_id, &new_path, repos_in_use) + { + i += 1; + } else { + break new_path; } } - // Set the process group to 0 to ensure that the spawned process does not exit immediately on CTRL-C - #[cfg(unix)] - command.process_group(0); - - command - .stdin(Stdio::null()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn() - .wrap_err(format!( - "failed to run runtime {}/{}", - runtime_config.node.dataflow_id, runtime_config.node.node_id - ))? 
+            } else {
+                clone_dir_base
+            }
+        } else {
+            clone_dir_base
+        };
+        let clone_dir = dunce::simplified(&clone_dir).to_owned();
+
+        let (reuse, checkout) = if clone_dir_exists(&clone_dir, repos_in_use) {
+            let empty = BTreeSet::new();
+            let in_use = repos_in_use.get(&clone_dir).unwrap_or(&empty);
+            let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id);
+            if used_by_other_dataflow {
+                // TODO allow if still up to date
+                eyre::bail!("clone_dir is already in use by other dataflow")
+            } else if in_use.is_empty() {
+                (true, true)
+            } else {
+                (true, false)
+            }
+        } else {
+            (false, true)
+        };
+        repos_in_use
+            .entry(clone_dir.clone())
+            .or_default()
+            .insert(dataflow_id);
+
+        Ok(PreparedGit {
+            clone_dir,
+            reuse,
+            checkout,
+        })
+    }
+
+    async fn git_node_spawn_command(
+        &mut self,
+        node: &dora_core::descriptor::CustomNode,
+        repo_addr: &String,
+        rev: &Option<GitRepoRev>,
+        logger: &mut NodeLogger<'_>,
+        node_env: &Option<BTreeMap<String, EnvValue>>,
+        prepared: PreparedGit,
+    ) -> Result<Option<tokio::process::Command>, eyre::Error> {
+        let PreparedGit {
+            clone_dir,
+            reuse,
+            checkout,
+        } = prepared;
+
+        let rev_str = rev_str(rev);
+        let refname = rev.clone().map(|rev| match rev {
+            GitRepoRev::Branch(branch) => format!("refs/remotes/origin/{branch}"),
+            GitRepoRev::Tag(tag) => format!("refs/tags/{tag}"),
+            GitRepoRev::Rev(rev) => rev,
+        });
+
+        if reuse {
+            logger
+                .log(
+                    LogLevel::Info,
+                    None,
+                    format!("reusing {repo_addr}{rev_str}"),
+                )
+                .await;
+            let refname_cloned = refname.clone();
+            let clone_dir = clone_dir.clone();
+            let repository = fetch_changes(clone_dir, refname_cloned).await?;
+            if checkout {
+                checkout_tree(&repository, refname)?;
+            }
+        } else {
+            let repository = clone_into(repo_addr, rev, &clone_dir, logger).await?;
+            if checkout {
+                checkout_tree(&repository, refname)?;
+            }
+        };
+        if let Some(build) = &node.build {
+            self.build_node(logger, node_env, clone_dir.clone(), build)
+                .await?;
+        }
+        if self.build_only {
+            Ok(None)
+        } else {
+            path_spawn_command(&clone_dir, self.uv, logger, node, true).await
+        }
+    }
+
+    fn used_by_other_dataflow(
+        &mut self,
+        dataflow_id: uuid::Uuid,
+        clone_dir_base: &PathBuf,
+        repos_in_use: &mut BTreeMap<PathBuf, BTreeSet<Uuid>>,
+    ) -> bool {
+        let empty = BTreeSet::new();
+        let in_use = repos_in_use.get(clone_dir_base).unwrap_or(&empty);
+        let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id);
+        used_by_other_dataflow
+    }
+
+    async fn build_node(
+        &mut self,
+        logger: &mut NodeLogger<'_>,
+        node_env: &Option<BTreeMap<String, EnvValue>>,
+        working_dir: PathBuf,
+        build: &String,
+    ) -> Result<(), eyre::Error> {
+        logger
+            .log(
+                LogLevel::Info,
+                None,
+                format!("running build command: `{build}`"),
+            )
+            .await;
+        let build = build.to_owned();
+        let uv = self.uv;
+        let node_env = node_env.clone();
+        let task = tokio::task::spawn_blocking(move || {
+            run_build_command(&build, &working_dir, uv, &node_env).context("build command failed")
+        });
+        task.await??;
+        Ok(())
+    }
+}
+
+pub struct PreparedNode {
+    command: Option<tokio::process::Command>,
+    spawn_error_msg: String,
+    working_dir: PathBuf,
+    dataflow_id: DataflowId,
+    node: ResolvedNode,
+    node_config: NodeConfig,
+    clock: Arc<HLC>,
+    daemon_tx: mpsc::Sender<Timestamped<Event>>,
+    node_stderr_most_recent: Arc<ArrayQueue<String>>,
+}
+
+impl PreparedNode {
+    pub fn node_id(&self) -> &NodeId {
+        &self.node.id
+    }
+
+    pub async fn spawn(mut self, logger: &mut NodeLogger<'_>) -> eyre::Result<RunningNode> {
+        let mut child = match &mut self.command {
+            Some(command) => command.spawn().wrap_err(self.spawn_error_msg)?,
+            None => {
+                return Ok(RunningNode {
+                    pid: None,
+                    node_config: self.node_config,
+                })
+            }
         };
@@ -336,22 +558,29 @@
) .await; - let dataflow_dir: PathBuf = self.working_dir.join("out").join(dataflow_id.to_string()); + let dataflow_dir: PathBuf = self + .working_dir + .join("out") + .join(self.dataflow_id.to_string()); if !dataflow_dir.exists() { std::fs::create_dir_all(&dataflow_dir).context("could not create dataflow_dir")?; } let (tx, mut rx) = mpsc::channel(10); - let mut file = File::create(log::log_path(&self.working_dir, &dataflow_id, &node.id)) - .await - .expect("Failed to create log file"); + let mut file = File::create(log::log_path( + &self.working_dir, + &self.dataflow_id, + &self.node.id, + )) + .await + .expect("Failed to create log file"); let mut child_stdout = tokio::io::BufReader::new(child.stdout.take().expect("failed to take stdout")); let running_node = RunningNode { pid: Some(pid), - node_config, + node_config: self.node_config, }; let stdout_tx = tx.clone(); - let node_id = node.id.clone(); + let node_id = self.node.id.clone(); // Stdout listener stream tokio::spawn(async move { let mut buffer = String::new(); @@ -411,7 +640,7 @@ impl Spawner { // Stderr listener stream let stderr_tx = tx.clone(); - let node_id = node.id.clone(); + let node_id = self.node.id.clone(); let uhlc = self.clock.clone(); let daemon_tx_log = self.daemon_tx.clone(); tokio::spawn(async move { @@ -447,7 +676,7 @@ impl Spawner { buffer.push_str(&new); - node_stderr_most_recent.force_push(new); + self.node_stderr_most_recent.force_push(new); // send the buffered lines let lines = std::mem::take(&mut buffer); @@ -458,10 +687,11 @@ impl Spawner { } }); - let node_id = node.id.clone(); + let node_id = self.node.id.clone(); let (log_finish_tx, log_finish_rx) = oneshot::channel(); let clock = self.clock.clone(); let daemon_tx = self.daemon_tx.clone(); + let dataflow_id = self.dataflow_id; tokio::spawn(async move { let exit_status = NodeExitStatus::from(child.wait().await); let _ = log_finish_rx.await; @@ -478,7 +708,7 @@ impl Spawner { let _ = daemon_tx.send(event).await; }); - let node_id = node.id.clone(); + let node_id = self.node.id.clone(); let daemon_id = logger.inner().inner().daemon_id().clone(); let mut cloned_logger = logger .inner() @@ -488,6 +718,11 @@ impl Spawner { .await .context("failed to clone logger")?; + let send_stdout_to = self + .node + .send_stdout_as() + .context("Could not resolve `send_stdout_as` configuration")?; + // Log to file stream. 
tokio::spawn(async move { while let Some(message) = rx.recv().await { @@ -557,176 +792,6 @@ impl Spawner { }); Ok(running_node) } - - async fn prepare_git_node( - &mut self, - repo_addr: &String, - rev: &Option, - repos_in_use: &mut BTreeMap>, - ) -> eyre::Result { - let dataflow_id = self.dataflow_id; - let repo_url = Url::parse(repo_addr).context("failed to parse git repository URL")?; - let target_dir = self.working_dir.join("build"); - - let clone_dir_base = { - let base = { - let mut path = - target_dir.join(repo_url.host_str().context("git URL has no hostname")?); - - path.extend(repo_url.path_segments().context("no path in git URL")?); - path - }; - match rev { - None => base, - Some(rev) => match rev { - GitRepoRev::Branch(branch) => base.join("branch").join(branch), - GitRepoRev::Tag(tag) => base.join("tag").join(tag), - GitRepoRev::Rev(rev) => base.join("rev").join(rev), - }, - } - }; - let clone_dir = if clone_dir_exists(&clone_dir_base, repos_in_use) { - let used_by_other_dataflow = - self.used_by_other_dataflow(dataflow_id, &clone_dir_base, repos_in_use); - if used_by_other_dataflow { - // don't reuse, choose new directory - // (TODO reuse if still up to date) - - let dir_name = clone_dir_base.file_name().unwrap().to_str().unwrap(); - let mut i = 1; - loop { - let new_path = clone_dir_base.with_file_name(format!("{dir_name}-{i}")); - if clone_dir_exists(&new_path, repos_in_use) - && self.used_by_other_dataflow(dataflow_id, &new_path, repos_in_use) - { - i += 1; - } else { - break new_path; - } - } - } else { - clone_dir_base - } - } else { - clone_dir_base - }; - let clone_dir = dunce::simplified(&clone_dir).to_owned(); - - let (reuse, checkout) = if clone_dir_exists(&clone_dir, repos_in_use) { - let empty = BTreeSet::new(); - let in_use = repos_in_use.get(&clone_dir).unwrap_or(&empty); - let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); - if used_by_other_dataflow { - // TODO allow if still up to date - eyre::bail!("clone_dir is already in use by other dataflow") - } else if in_use.is_empty() { - (true, true) - } else { - (true, false) - } - } else { - (false, true) - }; - repos_in_use - .entry(clone_dir.clone()) - .or_default() - .insert(dataflow_id); - - Ok(PreparedGit { - clone_dir, - reuse, - checkout, - }) - } - - async fn spawn_git_node( - &mut self, - node: &dora_core::descriptor::CustomNode, - repo_addr: &String, - rev: &Option, - logger: &mut NodeLogger<'_>, - node_env: &Option>, - prepared: PreparedGit, - ) -> Result, eyre::Error> { - let PreparedGit { - clone_dir, - reuse, - checkout, - } = prepared; - - let rev_str = rev_str(rev); - let refname = rev.clone().map(|rev| match rev { - GitRepoRev::Branch(branch) => format!("refs/remotes/origin/{branch}"), - GitRepoRev::Tag(tag) => format!("refs/tags/{tag}"), - GitRepoRev::Rev(rev) => rev, - }); - - if reuse { - logger - .log( - LogLevel::Info, - None, - format!("reusing {repo_addr}{rev_str}"), - ) - .await; - let refname_cloned = refname.clone(); - let clone_dir = clone_dir.clone(); - let repository = fetch_changes(clone_dir, refname_cloned).await?; - if checkout { - checkout_tree(&repository, refname)?; - } - } else { - let repository = clone_into(repo_addr, rev, &clone_dir, logger).await?; - if checkout { - checkout_tree(&repository, refname)?; - } - }; - if let Some(build) = &node.build { - self.build_node(logger, node_env, clone_dir.clone(), build) - .await?; - } - if self.build_only { - Ok(None) - } else { - spawn_command_from_path(&clone_dir, self.uv, logger, node, true).await - } - } - - 
async fn build_node( - &mut self, - logger: &mut NodeLogger<'_>, - node_env: &Option>, - working_dir: PathBuf, - build: &String, - ) -> Result<(), eyre::Error> { - logger - .log( - LogLevel::Info, - None, - format!("running build command: `{build}"), - ) - .await; - let build = build.to_owned(); - let uv = self.uv; - let node_env = node_env.clone(); - let task = tokio::task::spawn_blocking(move || { - run_build_command(&build, &working_dir, uv, &node_env).context("build command failed") - }); - task.await??; - Ok(()) - } - - fn used_by_other_dataflow( - &mut self, - dataflow_id: uuid::Uuid, - clone_dir_base: &PathBuf, - repos_in_use: &mut BTreeMap>, - ) -> bool { - let empty = BTreeSet::new(); - let in_use = repos_in_use.get(clone_dir_base).unwrap_or(&empty); - let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); - used_by_other_dataflow - } } fn rev_str(rev: &Option) -> String { @@ -832,7 +897,7 @@ fn checkout_tree(repository: &git2::Repository, refname: Option) -> eyre Ok(()) } -async fn spawn_command_from_path( +async fn path_spawn_command( working_dir: &Path, uv: bool, logger: &mut NodeLogger<'_>, From f71ad7922e24b5948fa1e34b31bc3d05c7370b07 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 28 Apr 2025 14:17:48 +0200 Subject: [PATCH 044/101] Refactor reuse logic and improve documentation --- binaries/daemon/src/spawn.rs | 84 ++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index f47233d8..6b9a7d89 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -402,31 +402,37 @@ impl Spawner { }; let clone_dir = dunce::simplified(&clone_dir).to_owned(); - let (reuse, checkout) = if clone_dir_exists(&clone_dir, repos_in_use) { + let reuse = if clone_dir_exists(&clone_dir, repos_in_use) { let empty = BTreeSet::new(); let in_use = repos_in_use.get(&clone_dir).unwrap_or(&empty); let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); if used_by_other_dataflow { - // TODO allow if still up to date + // The directory is currently in use by another dataflow. We currently don't + // support reusing the same clone across multiple dataflow runs. Above, we + // choose a new directory if we detect such a case. So this `if` branch + // should never be reached. eyre::bail!("clone_dir is already in use by other dataflow") } else if in_use.is_empty() { - (true, true) + // The cloned repo is not used by any dataflow, so we can safely reuse it. However, + // the clone might be still on an older commit, so we need to do a `git fetch` + // before we reuse it. + ReuseOptions::ReuseAfterFetch } else { - (true, false) + // This clone is already used for another node of this dataflow. We will do a + // `git fetch` operation for the first node of this dataflow, so we don't need + // to do it again for other nodes of the dataflow. So we can simply reuse the + // directory without doing any additional git operations. 
+                ReuseOptions::Reuse
+            }
+        } else {
-            (false, true)
+            ReuseOptions::NewClone
         };
         repos_in_use
             .entry(clone_dir.clone())
             .or_default()
             .insert(dataflow_id);
 
-        Ok(PreparedGit {
-            clone_dir,
-            reuse,
-            checkout,
-        })
+        Ok(PreparedGit { clone_dir, reuse })
     }
 
     async fn git_node_spawn_command(
@@ -438,11 +444,7 @@ impl Spawner {
         node_env: &Option<BTreeMap<String, EnvValue>>,
         prepared: PreparedGit,
     ) -> Result<Option<tokio::process::Command>, eyre::Error> {
-        let PreparedGit {
-            clone_dir,
-            reuse,
-            checkout,
-        } = prepared;
+        let PreparedGit { clone_dir, reuse } = prepared;
 
         let rev_str = rev_str(rev);
         let refname = rev.clone().map(|rev| match rev {
@@ -451,25 +453,33 @@ impl Spawner {
             GitRepoRev::Rev(rev) => rev,
         });
 
-        if reuse {
-            logger
-                .log(
-                    LogLevel::Info,
-                    None,
-                    format!("reusing {repo_addr}{rev_str}"),
-                )
-                .await;
-            let refname_cloned = refname.clone();
-            let clone_dir = clone_dir.clone();
-            let repository = fetch_changes(clone_dir, refname_cloned).await?;
-            if checkout {
+        match reuse {
+            ReuseOptions::NewClone => {
+                let repository = clone_into(repo_addr, rev, &clone_dir, logger).await?;
                 checkout_tree(&repository, refname)?;
             }
-        } else {
-            let repository = clone_into(repo_addr, rev, &clone_dir, logger).await?;
-            if checkout {
+            ReuseOptions::ReuseAfterFetch => {
+                logger
+                    .log(
+                        LogLevel::Info,
+                        None,
+                        format!("fetching changes and reusing {repo_addr}{rev_str}"),
+                    )
+                    .await;
+                let refname_cloned = refname.clone();
+                let clone_dir = clone_dir.clone();
+                let repository = fetch_changes(clone_dir, refname_cloned).await?;
                 checkout_tree(&repository, refname)?;
             }
+            ReuseOptions::Reuse => {
+                logger
+                    .log(
+                        LogLevel::Info,
+                        None,
+                        format!("reusing up-to-date {repo_addr}{rev_str}"),
+                    )
+                    .await;
+            }
         };
         if let Some(build) = &node.build {
             self.build_node(logger, node_env, clone_dir.clone(), build)
@@ -995,9 +1005,19 @@ async fn path_spawn_command(
 }
 
 struct PreparedGit {
+    /// The directory that should contain the checked-out repository.
     clone_dir: PathBuf,
-    reuse: bool,
-    checkout: bool,
+    /// Specifies whether an existing repo should be reused.
+    reuse: ReuseOptions,
+}
+
+enum ReuseOptions {
+    /// Create a new clone of the repository.
+    NewClone,
+    /// Reuse an existing up-to-date clone of the repository.
+    Reuse,
+    /// Update an older clone of the repository, then reuse it.
+ ReuseAfterFetch, } fn clone_dir_exists(dir: &PathBuf, repos_in_use: &BTreeMap>) -> bool { From 80436901cefca957d3d89b4ddae71fa268822065 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 28 Apr 2025 14:19:07 +0200 Subject: [PATCH 045/101] Rename module from `spawn.rs` to `spawn/mod.rs` to prepare for submodules --- binaries/daemon/src/{spawn.rs => spawn/mod.rs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename binaries/daemon/src/{spawn.rs => spawn/mod.rs} (100%) diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn/mod.rs similarity index 100% rename from binaries/daemon/src/spawn.rs rename to binaries/daemon/src/spawn/mod.rs From a58936fa0956ce0f09ff821ba513d65c554890fb Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 28 Apr 2025 14:48:29 +0200 Subject: [PATCH 046/101] Refactor: Move git operations to submodule --- binaries/daemon/src/spawn/git.rs | 286 ++++++++++++++++++++++++++ binaries/daemon/src/spawn/mod.rs | 338 +++---------------------------- 2 files changed, 315 insertions(+), 309 deletions(-) create mode 100644 binaries/daemon/src/spawn/git.rs diff --git a/binaries/daemon/src/spawn/git.rs b/binaries/daemon/src/spawn/git.rs new file mode 100644 index 00000000..9803d1f2 --- /dev/null +++ b/binaries/daemon/src/spawn/git.rs @@ -0,0 +1,286 @@ +use crate::log::NodeLogger; +use dora_message::{common::LogLevel, descriptor::GitRepoRev, DataflowId}; +use eyre::{ContextCompat, WrapErr}; +use git2::FetchOptions; +use std::{ + collections::{BTreeMap, BTreeSet}, + path::{Path, PathBuf}, +}; +use url::Url; +use uuid::Uuid; + +pub struct GitFolder { + /// The URL of the git repository. + repo_addr: String, + /// The branch, tag, or git revision to checkout. + rev: Option, + /// The directory that should contain the checked-out repository. + clone_dir: PathBuf, + /// Specifies whether an existing repo should be reused. 
+ reuse: ReuseOptions, +} + +impl GitFolder { + pub fn choose_clone_dir( + dataflow_id: uuid::Uuid, + repo_addr: String, + rev: Option, + target_dir: &Path, + repos_in_use: &mut BTreeMap>, + ) -> eyre::Result { + let repo_url = Url::parse(&repo_addr).context("failed to parse git repository URL")?; + + let base_dir = { + let base = { + let mut path = + target_dir.join(repo_url.host_str().context("git URL has no hostname")?); + + path.extend(repo_url.path_segments().context("no path in git URL")?); + path + }; + match &rev { + None => base, + Some(rev) => match rev { + GitRepoRev::Branch(branch) => base.join("branch").join(branch), + GitRepoRev::Tag(tag) => base.join("tag").join(tag), + GitRepoRev::Rev(rev) => base.join("rev").join(rev), + }, + } + }; + let clone_dir = if clone_dir_exists(&base_dir, repos_in_use) { + let used_by_other = used_by_other_dataflow(dataflow_id, &base_dir, repos_in_use); + if used_by_other { + // don't reuse, choose new directory + // (TODO reuse if still up to date) + + let dir_name = base_dir.file_name().unwrap().to_str().unwrap(); + let mut i = 1; + loop { + let new_path = base_dir.with_file_name(format!("{dir_name}-{i}")); + if clone_dir_exists(&new_path, repos_in_use) + && used_by_other_dataflow(dataflow_id, &new_path, repos_in_use) + { + i += 1; + } else { + break new_path; + } + } + } else { + base_dir + } + } else { + base_dir + }; + let clone_dir = dunce::simplified(&clone_dir).to_owned(); + + let reuse = if clone_dir_exists(&clone_dir, repos_in_use) { + let empty = BTreeSet::new(); + let in_use = repos_in_use.get(&clone_dir).unwrap_or(&empty); + let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); + if used_by_other_dataflow { + // The directory is currently in use by another dataflow. We currently don't + // support reusing the same clone across multiple dataflow runs. Above, we + // choose a new directory if we detect such a case. So this `if` branch + // should never be reached. + eyre::bail!("clone_dir is already in use by other dataflow") + } else if in_use.is_empty() { + // The cloned repo is not used by any dataflow, so we can safely reuse it. However, + // the clone might be still on an older commit, so we need to do a `git fetch` + // before we reuse it. + ReuseOptions::ReuseAfterFetch + } else { + // This clone is already used for another node of this dataflow. We will do a + // `git fetch` operation for the first node of this dataflow, so we don't need + // to do it again for other nodes of the dataflow. So we can simply reuse the + // directory without doing any additional git operations. 
+ ReuseOptions::Reuse + } + } else { + ReuseOptions::NewClone + }; + repos_in_use + .entry(clone_dir.clone()) + .or_default() + .insert(dataflow_id); + + Ok(GitFolder { + clone_dir, + reuse, + repo_addr, + rev, + }) + } + + pub async fn prepare(self, logger: &mut NodeLogger<'_>) -> eyre::Result { + let GitFolder { + clone_dir, + reuse, + repo_addr, + rev, + } = self; + + let rev_str = rev_str(&rev); + let refname = rev.clone().map(|rev| match rev { + GitRepoRev::Branch(branch) => format!("refs/remotes/origin/{branch}"), + GitRepoRev::Tag(tag) => format!("refs/tags/{tag}"), + GitRepoRev::Rev(rev) => rev, + }); + + match reuse { + ReuseOptions::NewClone => { + let repository = clone_into(&repo_addr, &rev, &clone_dir, logger).await?; + checkout_tree(&repository, refname)?; + } + ReuseOptions::ReuseAfterFetch => { + logger + .log( + LogLevel::Info, + None, + format!("fetching changes and reusing {repo_addr}{rev_str}"), + ) + .await; + let refname_cloned = refname.clone(); + let clone_dir = clone_dir.clone(); + let repository = fetch_changes(clone_dir, refname_cloned).await?; + checkout_tree(&repository, refname)?; + } + ReuseOptions::Reuse => { + logger + .log( + LogLevel::Info, + None, + format!("reusing up-to-date {repo_addr}{rev_str}"), + ) + .await; + } + }; + Ok(clone_dir) + } +} + +fn used_by_other_dataflow( + dataflow_id: uuid::Uuid, + clone_dir_base: &PathBuf, + repos_in_use: &mut BTreeMap>, +) -> bool { + let empty = BTreeSet::new(); + let in_use = repos_in_use.get(clone_dir_base).unwrap_or(&empty); + let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); + used_by_other_dataflow +} + +enum ReuseOptions { + /// Create a new clone of the repository. + NewClone, + /// Reuse an existing up-to-date clone of the repository. + Reuse, + /// Update an older clone of the repository, then reuse it. 
+ ReuseAfterFetch, +} + +fn rev_str(rev: &Option) -> String { + match rev { + Some(GitRepoRev::Branch(branch)) => format!(" (branch {branch})"), + Some(GitRepoRev::Tag(tag)) => format!(" (tag {tag})"), + Some(GitRepoRev::Rev(rev)) => format!(" (rev {rev})"), + None => String::new(), + } +} + +async fn clone_into( + repo_addr: &String, + rev: &Option, + clone_dir: &Path, + logger: &mut NodeLogger<'_>, +) -> eyre::Result { + if let Some(parent) = clone_dir.parent() { + tokio::fs::create_dir_all(parent) + .await + .context("failed to create parent directory for git clone")?; + } + + let rev_str = rev_str(rev); + logger + .log( + LogLevel::Info, + None, + format!("cloning {repo_addr}{rev_str} into {}", clone_dir.display()), + ) + .await; + let rev: Option = rev.clone(); + let clone_into = clone_dir.to_owned(); + let repo_addr = repo_addr.clone(); + let task = tokio::task::spawn_blocking(move || { + let mut builder = git2::build::RepoBuilder::new(); + let mut fetch_options = git2::FetchOptions::new(); + fetch_options.download_tags(git2::AutotagOption::All); + builder.fetch_options(fetch_options); + if let Some(GitRepoRev::Branch(branch)) = &rev { + builder.branch(branch); + } + builder + .clone(&repo_addr, &clone_into) + .context("failed to clone repo") + }); + let repo = task.await??; + Ok(repo) +} + +async fn fetch_changes( + repo_dir: PathBuf, + refname: Option, +) -> Result { + let fetch_changes = tokio::task::spawn_blocking(move || { + let repository = git2::Repository::open(&repo_dir).context("failed to open git repo")?; + + { + let mut remote = repository + .find_remote("origin") + .context("failed to find remote `origin` in repo")?; + remote + .connect(git2::Direction::Fetch) + .context("failed to connect to remote")?; + let default_branch = remote + .default_branch() + .context("failed to get default branch for remote")?; + let fetch = match &refname { + Some(refname) => refname, + None => default_branch + .as_str() + .context("failed to read default branch as string")?, + }; + let mut fetch_options = FetchOptions::new(); + fetch_options.download_tags(git2::AutotagOption::All); + remote + .fetch(&[&fetch], Some(&mut fetch_options), None) + .context("failed to fetch from git repo")?; + } + Result::<_, eyre::Error>::Ok(repository) + }); + let repository = fetch_changes.await??; + Ok(repository) +} + +fn checkout_tree(repository: &git2::Repository, refname: Option) -> eyre::Result<()> { + if let Some(refname) = refname { + let (object, reference) = repository + .revparse_ext(&refname) + .context("failed to parse ref")?; + repository + .checkout_tree(&object, None) + .context("failed to checkout ref")?; + match reference { + Some(reference) => repository + .set_head(reference.name().context("failed to get reference_name")?) 
+ .context("failed to set head")?, + None => repository + .set_head_detached(object.id()) + .context("failed to set detached head")?, + } + } + Ok(()) +} + +fn clone_dir_exists(dir: &PathBuf, repos_in_use: &BTreeMap>) -> bool { + repos_in_use.contains_key(dir) || dir.exists() +} diff --git a/binaries/daemon/src/spawn/mod.rs b/binaries/daemon/src/spawn/mod.rs index 6b9a7d89..c0aaf13c 100644 --- a/binaries/daemon/src/spawn/mod.rs +++ b/binaries/daemon/src/spawn/mod.rs @@ -21,7 +21,7 @@ use dora_message::{ common::{LogLevel, LogMessage}, daemon_to_coordinator::{DataMessage, NodeExitStatus, Timestamped}, daemon_to_node::{NodeConfig, RuntimeConfig}, - descriptor::{EnvValue, GitRepoRev}, + descriptor::EnvValue, id::NodeId, DataflowId, }; @@ -31,7 +31,7 @@ use dora_node_api::{ Metadata, }; use eyre::{ContextCompat, WrapErr}; -use git2::FetchOptions; +use git::GitFolder; use std::{ collections::{BTreeMap, BTreeSet}, future::Future, @@ -45,8 +45,8 @@ use tokio::{ sync::{mpsc, oneshot}, }; use tracing::error; -use url::Url; -use uuid::Uuid; + +mod git; #[derive(Clone)] pub struct Spawner { @@ -62,7 +62,7 @@ pub struct Spawner { impl Spawner { pub async fn prepare_node( - mut self, + self, node: ResolvedNode, node_stderr_most_recent: Arc>, logger: &mut NodeLogger<'_>, @@ -106,7 +106,15 @@ impl Spawner { .. }) = &node.kind { - Some(self.prepare_git_node(repo, rev, repos_in_use).await?) + let target_dir = self.working_dir.join("build"); + let git_folder = GitFolder::choose_clone_dir( + self.dataflow_id, + repo.clone(), + rev.clone(), + &target_dir, + repos_in_use, + )?; + Some(git_folder) } else { None }; @@ -135,35 +143,26 @@ impl Spawner { logger: &mut NodeLogger<'_>, dataflow_id: uuid::Uuid, node_config: NodeConfig, - prepared_git: Option, + git_folder: Option, node_stderr_most_recent: Arc>, ) -> eyre::Result { let (command, error_msg) = match &node.kind { dora_core::descriptor::CoreNodeKind::Custom(n) => { - let mut command = match &n.source { - dora_message::descriptor::NodeSource::Local => { - if let Some(build) = &n.build { - self.build_node(logger, &node.env, self.working_dir.clone(), build) - .await?; - } - if self.build_only { - None - } else { - path_spawn_command(&self.working_dir, self.uv, logger, &n, true).await? - } - } - dora_message::descriptor::NodeSource::GitBranch { repo, rev } => { - self.git_node_spawn_command( - &n, - repo, - rev, - logger, - &node.env, - prepared_git.unwrap(), - ) - .await? - } + let build_dir = match git_folder { + Some(git_folder) => git_folder.prepare(logger).await?, + None => self.working_dir.clone(), + }; + + if let Some(build) = &n.build { + self.build_node(logger, &node.env, build_dir.clone(), build) + .await?; + } + let mut command = if self.build_only { + None + } else { + path_spawn_command(&build_dir, self.uv, logger, n, true).await? 
}; + if let Some(command) = &mut command { command.current_dir(&self.working_dir); command.stdin(Stdio::null()); @@ -348,162 +347,6 @@ impl Spawner { }) } - async fn prepare_git_node( - &mut self, - repo_addr: &String, - rev: &Option, - repos_in_use: &mut BTreeMap>, - ) -> eyre::Result { - let dataflow_id = self.dataflow_id; - let repo_url = Url::parse(repo_addr).context("failed to parse git repository URL")?; - let target_dir = self.working_dir.join("build"); - - let clone_dir_base = { - let base = { - let mut path = - target_dir.join(repo_url.host_str().context("git URL has no hostname")?); - - path.extend(repo_url.path_segments().context("no path in git URL")?); - path - }; - match rev { - None => base, - Some(rev) => match rev { - GitRepoRev::Branch(branch) => base.join("branch").join(branch), - GitRepoRev::Tag(tag) => base.join("tag").join(tag), - GitRepoRev::Rev(rev) => base.join("rev").join(rev), - }, - } - }; - let clone_dir = if clone_dir_exists(&clone_dir_base, repos_in_use) { - let used_by_other_dataflow = - self.used_by_other_dataflow(dataflow_id, &clone_dir_base, repos_in_use); - if used_by_other_dataflow { - // don't reuse, choose new directory - // (TODO reuse if still up to date) - - let dir_name = clone_dir_base.file_name().unwrap().to_str().unwrap(); - let mut i = 1; - loop { - let new_path = clone_dir_base.with_file_name(format!("{dir_name}-{i}")); - if clone_dir_exists(&new_path, repos_in_use) - && self.used_by_other_dataflow(dataflow_id, &new_path, repos_in_use) - { - i += 1; - } else { - break new_path; - } - } - } else { - clone_dir_base - } - } else { - clone_dir_base - }; - let clone_dir = dunce::simplified(&clone_dir).to_owned(); - - let reuse = if clone_dir_exists(&clone_dir, repos_in_use) { - let empty = BTreeSet::new(); - let in_use = repos_in_use.get(&clone_dir).unwrap_or(&empty); - let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); - if used_by_other_dataflow { - // The directory is currently in use by another dataflow. We currently don't - // support reusing the same clone across multiple dataflow runs. Above, we - // choose a new directory if we detect such a case. So this `if` branch - // should never be reached. - eyre::bail!("clone_dir is already in use by other dataflow") - } else if in_use.is_empty() { - // The cloned repo is not used by any dataflow, so we can safely reuse it. However, - // the clone might be still on an older commit, so we need to do a `git fetch` - // before we reuse it. - ReuseOptions::ReuseAfterFetch - } else { - // This clone is already used for another node of this dataflow. We will do a - // `git fetch` operation for the first node of this dataflow, so we don't need - // to do it again for other nodes of the dataflow. So we can simply reuse the - // directory without doing any additional git operations. 
- ReuseOptions::Reuse - } - } else { - ReuseOptions::NewClone - }; - repos_in_use - .entry(clone_dir.clone()) - .or_default() - .insert(dataflow_id); - - Ok(PreparedGit { clone_dir, reuse }) - } - - async fn git_node_spawn_command( - &mut self, - node: &dora_core::descriptor::CustomNode, - repo_addr: &String, - rev: &Option, - logger: &mut NodeLogger<'_>, - node_env: &Option>, - prepared: PreparedGit, - ) -> Result, eyre::Error> { - let PreparedGit { clone_dir, reuse } = prepared; - - let rev_str = rev_str(rev); - let refname = rev.clone().map(|rev| match rev { - GitRepoRev::Branch(branch) => format!("refs/remotes/origin/{branch}"), - GitRepoRev::Tag(tag) => format!("refs/tags/{tag}"), - GitRepoRev::Rev(rev) => rev, - }); - - match reuse { - ReuseOptions::NewClone => { - let repository = clone_into(repo_addr, rev, &clone_dir, logger).await?; - checkout_tree(&repository, refname)?; - } - ReuseOptions::ReuseAfterFetch => { - logger - .log( - LogLevel::Info, - None, - format!("fetching changes and reusing {repo_addr}{rev_str}"), - ) - .await; - let refname_cloned = refname.clone(); - let clone_dir = clone_dir.clone(); - let repository = fetch_changes(clone_dir, refname_cloned).await?; - checkout_tree(&repository, refname)?; - } - ReuseOptions::Reuse => { - logger - .log( - LogLevel::Info, - None, - format!("reusing up-to-date {repo_addr}{rev_str}"), - ) - .await; - } - }; - if let Some(build) = &node.build { - self.build_node(logger, node_env, clone_dir.clone(), build) - .await?; - } - if self.build_only { - Ok(None) - } else { - path_spawn_command(&clone_dir, self.uv, logger, node, true).await - } - } - - fn used_by_other_dataflow( - &mut self, - dataflow_id: uuid::Uuid, - clone_dir_base: &PathBuf, - repos_in_use: &mut BTreeMap>, - ) -> bool { - let empty = BTreeSet::new(); - let in_use = repos_in_use.get(clone_dir_base).unwrap_or(&empty); - let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); - used_by_other_dataflow - } - async fn build_node( &mut self, logger: &mut NodeLogger<'_>, @@ -804,109 +647,6 @@ impl PreparedNode { } } -fn rev_str(rev: &Option) -> String { - match rev { - Some(GitRepoRev::Branch(branch)) => format!(" (branch {branch})"), - Some(GitRepoRev::Tag(tag)) => format!(" (tag {tag})"), - Some(GitRepoRev::Rev(rev)) => format!(" (rev {rev})"), - None => String::new(), - } -} - -async fn clone_into( - repo_addr: &String, - rev: &Option, - clone_dir: &Path, - logger: &mut NodeLogger<'_>, -) -> eyre::Result { - if let Some(parent) = clone_dir.parent() { - tokio::fs::create_dir_all(parent) - .await - .context("failed to create parent directory for git clone")?; - } - - let rev_str = rev_str(rev); - logger - .log( - LogLevel::Info, - None, - format!("cloning {repo_addr}{rev_str} into {}", clone_dir.display()), - ) - .await; - let rev: Option = rev.clone(); - let clone_into = clone_dir.to_owned(); - let repo_addr = repo_addr.clone(); - let task = tokio::task::spawn_blocking(move || { - let mut builder = git2::build::RepoBuilder::new(); - let mut fetch_options = git2::FetchOptions::new(); - fetch_options.download_tags(git2::AutotagOption::All); - builder.fetch_options(fetch_options); - if let Some(GitRepoRev::Branch(branch)) = &rev { - builder.branch(branch); - } - builder - .clone(&repo_addr, &clone_into) - .context("failed to clone repo") - }); - let repo = task.await??; - Ok(repo) -} - -async fn fetch_changes( - repo_dir: PathBuf, - refname: Option, -) -> Result { - let fetch_changes = tokio::task::spawn_blocking(move || { - let repository = 
git2::Repository::open(&repo_dir).context("failed to open git repo")?;
-
-        {
-            let mut remote = repository
-                .find_remote("origin")
-                .context("failed to find remote `origin` in repo")?;
-            remote
-                .connect(git2::Direction::Fetch)
-                .context("failed to connect to remote")?;
-            let default_branch = remote
-                .default_branch()
-                .context("failed to get default branch for remote")?;
-            let fetch = match &refname {
-                Some(refname) => refname,
-                None => default_branch
-                    .as_str()
-                    .context("failed to read default branch as string")?,
-            };
-            let mut fetch_options = FetchOptions::new();
-            fetch_options.download_tags(git2::AutotagOption::All);
-            remote
-                .fetch(&[&fetch], Some(&mut fetch_options), None)
-                .context("failed to fetch from git repo")?;
-        }
-        Result::<_, eyre::Error>::Ok(repository)
-    });
-    let repository = fetch_changes.await??;
-    Ok(repository)
-}
-
-fn checkout_tree(repository: &git2::Repository, refname: Option<String>) -> eyre::Result<()> {
-    if let Some(refname) = refname {
-        let (object, reference) = repository
-            .revparse_ext(&refname)
-            .context("failed to parse ref")?;
-        repository
-            .checkout_tree(&object, None)
-            .context("failed to checkout ref")?;
-        match reference {
-            Some(reference) => repository
-                .set_head(reference.name().context("failed to get reference_name")?)
-                .context("failed to set head")?,
-            None => repository
-                .set_head_detached(object.id())
-                .context("failed to set detached head")?,
-        }
-    }
-    Ok(())
-}
-
 async fn path_spawn_command(
     working_dir: &Path,
     uv: bool,
@@ -1003,23 +743,3 @@ async fn path_spawn_command(
 
     Ok(Some(cmd))
 }
-
-struct PreparedGit {
-    /// The directory that should contain the checked-out repository.
-    clone_dir: PathBuf,
-    /// Specifies whether an existing repo should be reused.
-    reuse: ReuseOptions,
-}
-
-enum ReuseOptions {
-    /// Create a new clone of the repository.
-    NewClone,
-    /// Reuse an existing up-to-date clone of the repository.
-    Reuse,
-    /// Update an older clone of the repository, then reuse it.
-    ReuseAfterFetch,
-}
-
-fn clone_dir_exists(dir: &PathBuf, repos_in_use: &BTreeMap<PathBuf, BTreeSet<Uuid>>) -> bool {
-    repos_in_use.contains_key(dir) || dir.exists()
-}

From 90733aabeb5c79ad2ef807a5212a994c669ce12c Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Mon, 28 Apr 2025 16:02:49 +0200
Subject: [PATCH 047/101] Send build output to CLI

The coordinator now sends an immediate `DataflowStartTriggered` reply
when receiving a `Start` command. This enables the CLI to attach to the
dataflow right away and observe the build output.

To wait until the build/spawning is done, this commit introduces a new
`WaitForSpawn` command, to which the coordinator replies with a
`DataflowSpawned` message once all nodes have been spawned. We use this
command for `dora build`.
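
From the CLI's perspective, the new handshake then looks roughly like
this (a condensed sketch based on the request/reply types in this
patch; session setup and error handling are omitted):

    // trigger the build/start; the coordinator replies immediately
    session.request(&serde_json::to_vec(&ControlRequest::Start {
        /* dataflow, name, uv, build_only, ... */
    })?)?;
    // -> ControlRequestReply::DataflowStartTriggered { uuid }

    // block until all nodes of the dataflow are built and spawned
    session.request(&serde_json::to_vec(&ControlRequest::WaitForSpawn {
        dataflow_id: uuid,
    })?)?;
    // -> ControlRequestReply::DataflowSpawned { uuid }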
--- binaries/cli/src/attach.rs | 72 +++++++++++---------- binaries/cli/src/lib.rs | 39 +++++++++-- binaries/coordinator/src/lib.rs | 28 +++++--- binaries/daemon/src/spawn/mod.rs | 16 ++++- examples/multiple-daemons/run.rs | 16 ++++- libraries/core/src/build.rs | 37 ++++++++++- libraries/message/src/cli_to_coordinator.rs | 3 + libraries/message/src/coordinator_to_cli.rs | 3 +- 8 files changed, 162 insertions(+), 52 deletions(-) diff --git a/binaries/cli/src/attach.rs b/binaries/cli/src/attach.rs index 39c3a056..e40be1d8 100644 --- a/binaries/cli/src/attach.rs +++ b/binaries/cli/src/attach.rs @@ -155,39 +155,7 @@ pub fn attach_dataflow( }, Ok(AttachEvent::Control(control_request)) => control_request, Ok(AttachEvent::Log(Ok(log_message))) => { - let LogMessage { - dataflow_id, - node_id, - daemon_id, - level, - target, - module_path: _, - file: _, - line: _, - message, - } = log_message; - let level = match level { - log::Level::Error => "ERROR".red(), - log::Level::Warn => "WARN ".yellow(), - log::Level::Info => "INFO ".green(), - other => format!("{other:5}").normal(), - }; - let dataflow = format!(" dataflow `{dataflow_id}`").cyan(); - let daemon = match daemon_id { - Some(id) => format!(" on daemon `{id}`"), - None => " on default daemon".to_string(), - } - .bright_black(); - let node = match node_id { - Some(node_id) => format!(" {node_id}").bold(), - None => "".normal(), - }; - let target = match target { - Some(target) => format!(" {target}").dimmed(), - None => "".normal(), - }; - - println!("{level}{dataflow}{daemon}{node}{target}: {message}"); + print_log_message(log_message); continue; } Ok(AttachEvent::Log(Err(err))) => { @@ -202,7 +170,7 @@ pub fn attach_dataflow( let result: ControlRequestReply = serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; match result { - ControlRequestReply::DataflowStarted { uuid: _ } => (), + ControlRequestReply::DataflowSpawned { uuid: _ } => (), ControlRequestReply::DataflowStopped { uuid, result } => { info!("dataflow {uuid} stopped"); break handle_dataflow_result(result, Some(uuid)); @@ -215,6 +183,42 @@ pub fn attach_dataflow( } } +pub fn print_log_message(log_message: LogMessage) { + let LogMessage { + dataflow_id, + node_id, + daemon_id, + level, + target, + module_path: _, + file: _, + line: _, + message, + } = log_message; + let level = match level { + log::Level::Error => "ERROR".red(), + log::Level::Warn => "WARN ".yellow(), + log::Level::Info => "INFO ".green(), + other => format!("{other:5}").normal(), + }; + let dataflow = format!(" dataflow `{dataflow_id}`").cyan(); + let daemon = match daemon_id { + Some(id) => format!(" on daemon `{id}`"), + None => " on default daemon".to_string(), + } + .bright_black(); + let node = match node_id { + Some(node_id) => format!(" {node_id}").bold(), + None => "".normal(), + }; + let target = match target { + Some(target) => format!(" {target}").dimmed(), + None => "".normal(), + }; + + println!("{level}{dataflow}{daemon}{node}{target}: {message}"); +} + enum AttachEvent { Control(ControlRequest), Log(eyre::Result), diff --git a/binaries/cli/src/lib.rs b/binaries/cli/src/lib.rs index 2b667b0d..f1636273 100644 --- a/binaries/cli/src/lib.rs +++ b/binaries/cli/src/lib.rs @@ -377,7 +377,10 @@ fn run(args: Args) -> eyre::Result<()> { coordinator_port, uv, } => { - start_dataflow(dataflow, None, coordinator_addr, coordinator_port, uv, true)?; + let (_, _, _, mut session, uuid) = + start_dataflow(dataflow, None, coordinator_addr, coordinator_port, uv, true)?; + // wait until build is 
finished + wait_until_dataflow_started(uuid, &mut session, true)?; } Command::New { args, @@ -459,6 +462,9 @@ fn run(args: Args) -> eyre::Result<()> { coordinator_socket, log_level, )? + } else { + // wait until dataflow is started + wait_until_dataflow_started(dataflow_id, &mut session, false)?; } } Command::List { @@ -659,11 +665,11 @@ fn start_dataflow( let result: ControlRequestReply = serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; match result { - ControlRequestReply::DataflowStarted { uuid } => { + ControlRequestReply::DataflowStartTriggered { uuid } => { if build_only { - eprintln!("dataflow build successful"); + eprintln!("dataflow build triggered"); } else { - eprintln!("{uuid}"); + eprintln!("dataflow start triggered: {uuid}"); } uuid } @@ -680,6 +686,31 @@ fn start_dataflow( )) } +fn wait_until_dataflow_started( + dataflow_id: Uuid, + session: &mut Box, + build_only: bool, +) -> eyre::Result<()> { + let reply_raw = session + .request(&serde_json::to_vec(&ControlRequest::WaitForSpawn { dataflow_id }).unwrap()) + .wrap_err("failed to send start dataflow message")?; + + let result: ControlRequestReply = + serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; + match result { + ControlRequestReply::DataflowSpawned { uuid } => { + if build_only { + eprintln!("dataflow build finished"); + } else { + eprintln!("dataflow started: {uuid}"); + } + } + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), + } + Ok(()) +} + fn stop_dataflow_interactive( grace_duration: Option, session: &mut TcpRequestReplyConnection, diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 422043ca..d2967138 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -421,18 +421,26 @@ async fn start_inner( Ok(dataflow) }; match inner.await { - Ok(mut dataflow) => { - dataflow.spawn_result_tx = Some(reply_sender); - running_dataflows.insert(dataflow.uuid, dataflow); + Ok(dataflow) => { + let uuid = dataflow.uuid; + running_dataflows.insert(uuid, dataflow); + let _ = reply_sender.send(Ok( + ControlRequestReply::DataflowStartTriggered { uuid }, + )); } Err(err) => { let _ = reply_sender.send(Err(err)); } } } + ControlRequest::WaitForSpawn { dataflow_id } => { + if let Some(dataflow) = running_dataflows.get_mut(&dataflow_id) { + dataflow.spawn_result_tx.push(reply_sender); + } + } ControlRequest::Check { dataflow_uuid } => { let status = match &running_dataflows.get(&dataflow_uuid) { - Some(_) => ControlRequestReply::DataflowStarted { + Some(_) => ControlRequestReply::DataflowSpawned { uuid: dataflow_uuid, }, None => ControlRequestReply::DataflowStopped { @@ -726,9 +734,9 @@ async fn start_inner( "spawned" } ); - if let Some(reply_tx) = dataflow.spawn_result_tx.take() { + for reply_tx in dataflow.spawn_result_tx.drain(..) { let _ = - reply_tx.send(Ok(ControlRequestReply::DataflowStarted { + reply_tx.send(Ok(ControlRequestReply::DataflowSpawned { uuid: dataflow_id, })); } @@ -739,8 +747,8 @@ async fn start_inner( } Err(err) => { tracing::warn!("error while spawning dataflow `{dataflow_id}`"); - if let Some(reply_tx) = dataflow.spawn_result_tx.take() { - let _ = reply_tx.send(Err(err)); + for reply_tx in dataflow.spawn_result_tx.drain(..) 
{ + let _ = reply_tx.send(Err(eyre!(format!("{err:?}")))); } } }; @@ -857,7 +865,7 @@ struct RunningDataflow { log_subscribers: Vec, pending_spawn_results: BTreeSet, - spawn_result_tx: Option>>, + spawn_result_tx: Vec>>, build_only: bool, } @@ -1088,7 +1096,7 @@ async fn start_dataflow( reply_senders: Vec::new(), log_subscribers: Vec::new(), pending_spawn_results: daemons, - spawn_result_tx: None, + spawn_result_tx: Vec::new(), build_only, }) } diff --git a/binaries/daemon/src/spawn/mod.rs b/binaries/daemon/src/spawn/mod.rs index c0aaf13c..1d1eedaa 100644 --- a/binaries/daemon/src/spawn/mod.rs +++ b/binaries/daemon/src/spawn/mod.rs @@ -364,8 +364,22 @@ impl Spawner { let build = build.to_owned(); let uv = self.uv; let node_env = node_env.clone(); + let mut logger = logger.try_clone().await.context("failed to clone logger")?; + let (stdout_tx, mut stdout) = tokio::sync::mpsc::channel(10); let task = tokio::task::spawn_blocking(move || { - run_build_command(&build, &working_dir, uv, &node_env).context("build command failed") + run_build_command(&build, &working_dir, uv, &node_env, stdout_tx) + .context("build command failed") + }); + tokio::spawn(async move { + while let Some(line) = stdout.recv().await { + logger + .log( + LogLevel::Info, + Some("build command".into()), + line.unwrap_or_else(|err| format!("io err: {}", err.kind())), + ) + .await; + } }); task.await??; Ok(()) diff --git a/examples/multiple-daemons/run.rs b/examples/multiple-daemons/run.rs index a042253b..130d43c1 100644 --- a/examples/multiple-daemons/run.rs +++ b/examples/multiple-daemons/run.rs @@ -153,7 +153,21 @@ async fn start_dataflow( .await?; let result = reply.await??; let uuid = match result { - ControlRequestReply::DataflowStarted { uuid } => uuid, + ControlRequestReply::DataflowStartTriggered { uuid } => uuid, + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), + }; + + let (reply_sender, reply) = oneshot::channel(); + coordinator_events_tx + .send(Event::Control(ControlEvent::IncomingRequest { + request: ControlRequest::WaitForSpawn { dataflow_id: uuid }, + reply_sender, + })) + .await?; + let result = reply.await??; + let uuid = match result { + ControlRequestReply::DataflowSpawned { uuid } => uuid, ControlRequestReply::Error(err) => bail!("{err}"), other => bail!("unexpected start dataflow reply: {other:?}"), }; diff --git a/libraries/core/src/build.rs b/libraries/core/src/build.rs index 7672b66e..d04d35d3 100644 --- a/libraries/core/src/build.rs +++ b/libraries/core/src/build.rs @@ -1,4 +1,9 @@ -use std::{collections::BTreeMap, path::Path, process::Command}; +use std::{ + collections::BTreeMap, + io::{BufRead, BufReader}, + path::Path, + process::{Command, Stdio}, +}; use dora_message::descriptor::EnvValue; use eyre::{eyre, Context}; @@ -8,6 +13,7 @@ pub fn run_build_command( working_dir: &Path, uv: bool, envs: &Option>, + stdout_tx: tokio::sync::mpsc::Sender>, ) -> eyre::Result<()> { let lines = build.lines().collect::>(); for build_line in lines { @@ -34,6 +40,35 @@ pub fn run_build_command( } cmd.current_dir(dunce::simplified(working_dir)); + + cmd.stdin(Stdio::null()); + cmd.stdout(Stdio::piped()); + cmd.stderr(Stdio::piped()); + + let mut child = cmd + .spawn() + .wrap_err_with(|| format!("failed to spawn `{}`", build))?; + + let child_stdout = BufReader::new(child.stdout.take().expect("failed to take stdout")); + let child_stderr = BufReader::new(child.stderr.take().expect("failed to take stderr")); + let stderr_tx = stdout_tx.clone(); + 
let stdout_tx = stdout_tx.clone(); + + std::thread::spawn(move || { + for line in child_stdout.lines() { + if stdout_tx.blocking_send(line).is_err() { + break; + } + } + }); + std::thread::spawn(move || { + for line in child_stderr.lines() { + if stderr_tx.blocking_send(line).is_err() { + break; + } + } + }); + let exit_status = cmd .status() .wrap_err_with(|| format!("failed to run `{}`", build))?; diff --git a/libraries/message/src/cli_to_coordinator.rs b/libraries/message/src/cli_to_coordinator.rs index ab91f449..8436a11a 100644 --- a/libraries/message/src/cli_to_coordinator.rs +++ b/libraries/message/src/cli_to_coordinator.rs @@ -18,6 +18,9 @@ pub enum ControlRequest { uv: bool, build_only: bool, }, + WaitForSpawn { + dataflow_id: Uuid, + }, Reload { dataflow_id: Uuid, node_id: NodeId, diff --git a/libraries/message/src/coordinator_to_cli.rs b/libraries/message/src/coordinator_to_cli.rs index c8f1d3c8..87eb7ae7 100644 --- a/libraries/message/src/coordinator_to_cli.rs +++ b/libraries/message/src/coordinator_to_cli.rs @@ -9,7 +9,8 @@ use crate::{common::DaemonId, id::NodeId}; pub enum ControlRequestReply { Error(String), CoordinatorStopped, - DataflowStarted { uuid: Uuid }, + DataflowStartTriggered { uuid: Uuid }, + DataflowSpawned { uuid: Uuid }, DataflowReloaded { uuid: Uuid }, DataflowStopped { uuid: Uuid, result: DataflowResult }, DataflowList(DataflowList), From c9a720d7ae76ba20d3b612f3de0d297486d3a51f Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 28 Apr 2025 17:41:06 +0200 Subject: [PATCH 048/101] Error if coordinator doesn't reply properly Instead of sending `CoordinatorStopped`. This should make it easier to debug when a reply is not sent as expected. --- binaries/coordinator/src/control.rs | 4 ++-- libraries/message/src/cli_to_coordinator.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/binaries/coordinator/src/control.rs b/binaries/coordinator/src/control.rs index f7cb23c1..c0e92417 100644 --- a/binaries/coordinator/src/control.rs +++ b/binaries/coordinator/src/control.rs @@ -155,7 +155,7 @@ async fn handle_request( ) -> eyre::Result { let (reply_tx, reply_rx) = oneshot::channel(); let event = ControlEvent::IncomingRequest { - request, + request: request.clone(), reply_sender: reply_tx, }; @@ -165,7 +165,7 @@ async fn handle_request( reply_rx .await - .unwrap_or(Ok(ControlRequestReply::CoordinatorStopped)) + .wrap_err_with(|| format!("no coordinator reply to {request:?}"))? 
} #[derive(Debug)] diff --git a/libraries/message/src/cli_to_coordinator.rs b/libraries/message/src/cli_to_coordinator.rs index 8436a11a..456bb1bd 100644 --- a/libraries/message/src/cli_to_coordinator.rs +++ b/libraries/message/src/cli_to_coordinator.rs @@ -7,7 +7,7 @@ use crate::{ id::{NodeId, OperatorId}, }; -#[derive(Debug, serde::Deserialize, serde::Serialize)] +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] pub enum ControlRequest { Start { dataflow: Descriptor, From 95709b44b4d6ffb0aa18ca4c21924025d9d265d1 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 28 Apr 2025 19:46:00 +0200 Subject: [PATCH 049/101] Refactor spawn result recording to fix `WaitForSpawnResult` handling --- binaries/coordinator/src/lib.rs | 99 +++++++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 18 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index d2967138..1302662c 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -30,7 +30,11 @@ use std::{ sync::Arc, time::{Duration, Instant}, }; -use tokio::{net::TcpStream, sync::mpsc, task::JoinHandle}; +use tokio::{ + net::TcpStream, + sync::{mpsc, oneshot}, + task::JoinHandle, +}; use tokio_stream::wrappers::{ReceiverStream, TcpListenerStream}; use uuid::Uuid; @@ -371,9 +375,15 @@ async fn start_inner( DataflowResult::ok_empty(uuid, clock.new_timestamp()) }), }; - for sender in finished_dataflow.reply_senders { + for sender in finished_dataflow.stop_reply_senders { let _ = sender.send(Ok(reply.clone())); } + if !matches!( + finished_dataflow.spawn_result, + SpawnResult::Spawned { .. } + ) { + log::error!("pending spawn result on dataflow finish"); + } } } std::collections::hash_map::Entry::Vacant(_) => { @@ -435,7 +445,10 @@ async fn start_inner( } ControlRequest::WaitForSpawn { dataflow_id } => { if let Some(dataflow) = running_dataflows.get_mut(&dataflow_id) { - dataflow.spawn_result_tx.push(reply_sender); + dataflow.spawn_result.register(reply_sender); + } else { + let _ = + reply_sender.send(Err(eyre!("unknown dataflow {dataflow_id}"))); } } ControlRequest::Check { dataflow_uuid } => { @@ -508,7 +521,7 @@ async fn start_inner( match dataflow { Ok(dataflow) => { - dataflow.reply_senders.push(reply_sender); + dataflow.stop_reply_senders.push(reply_sender); } Err(err) => { let _ = reply_sender.send(Err(err)); @@ -541,7 +554,7 @@ async fn start_inner( match dataflow { Ok(dataflow) => { - dataflow.reply_senders.push(reply_sender); + dataflow.stop_reply_senders.push(reply_sender); } Err(err) => { let _ = reply_sender.send(Err(err)); @@ -734,12 +747,10 @@ async fn start_inner( "spawned" } ); - for reply_tx in dataflow.spawn_result_tx.drain(..) { - let _ = - reply_tx.send(Ok(ControlRequestReply::DataflowSpawned { - uuid: dataflow_id, - })); - } + dataflow.spawn_result.set_result(Ok( + ControlRequestReply::DataflowSpawned { uuid: dataflow_id }, + )); + if dataflow.build_only { running_dataflows.remove(&dataflow_id); } @@ -747,9 +758,7 @@ async fn start_inner( } Err(err) => { tracing::warn!("error while spawning dataflow `{dataflow_id}`"); - for reply_tx in dataflow.spawn_result_tx.drain(..) 
{ - let _ = reply_tx.send(Err(eyre!(format!("{err:?}")))); - } + dataflow.spawn_result.set_result(Err(err)); } }; } @@ -860,16 +869,70 @@ struct RunningDataflow { exited_before_subscribe: Vec, nodes: BTreeMap, - reply_senders: Vec>>, + spawn_result: SpawnResult, + stop_reply_senders: Vec>>, log_subscribers: Vec, pending_spawn_results: BTreeSet, - spawn_result_tx: Vec>>, build_only: bool, } +pub enum SpawnResult { + Pending { + result_senders: Vec>>, + }, + Spawned { + result: eyre::Result, + }, +} + +impl Default for SpawnResult { + fn default() -> Self { + Self::Pending { + result_senders: Vec::new(), + } + } +} + +impl SpawnResult { + fn register( + &mut self, + reply_sender: tokio::sync::oneshot::Sender>, + ) { + match self { + SpawnResult::Pending { result_senders } => result_senders.push(reply_sender), + SpawnResult::Spawned { result } => { + Self::send_result_to(result, reply_sender); + } + } + } + + fn set_result(&mut self, result: eyre::Result) { + match self { + SpawnResult::Pending { result_senders } => { + for sender in result_senders.drain(..) { + Self::send_result_to(&result, sender); + } + *self = SpawnResult::Spawned { result }; + } + SpawnResult::Spawned { .. } => {} + } + } + + fn send_result_to( + result: &eyre::Result, + sender: oneshot::Sender>, + ) { + let result = match result { + Ok(r) => Ok(r.clone()), + Err(err) => Err(eyre!("{err:?}")), + }; + let _ = sender.send(result); + } +} + struct ArchivedDataflow { name: Option, nodes: BTreeMap, @@ -1093,10 +1156,10 @@ async fn start_dataflow( exited_before_subscribe: Default::default(), daemons: daemons.clone(), nodes, - reply_senders: Vec::new(), + spawn_result: SpawnResult::default(), + stop_reply_senders: Vec::new(), log_subscribers: Vec::new(), pending_spawn_results: daemons, - spawn_result_tx: Vec::new(), build_only, }) } From 4c22c16570e8156832e7d882b64b6ab87b6c1bd2 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 29 Apr 2025 10:58:38 +0200 Subject: [PATCH 050/101] Subscribe to log messages while waiting for dataflow spawning --- binaries/cli/src/lib.rs | 103 +++++++++++++++++++++++++--------------- 1 file changed, 66 insertions(+), 37 deletions(-) diff --git a/binaries/cli/src/lib.rs b/binaries/cli/src/lib.rs index f1636273..1bbc6cc0 100644 --- a/binaries/cli/src/lib.rs +++ b/binaries/cli/src/lib.rs @@ -1,6 +1,8 @@ -use attach::attach_dataflow; +use attach::{attach_dataflow, print_log_message}; use colored::Colorize; -use communication_layer_request_reply::{RequestReplyLayer, TcpLayer, TcpRequestReplyConnection}; +use communication_layer_request_reply::{ + RequestReplyLayer, TcpConnection, TcpLayer, TcpRequestReplyConnection, +}; use dora_coordinator::Event; use dora_core::{ descriptor::{source_is_url, Descriptor, DescriptorExt}, @@ -13,6 +15,7 @@ use dora_daemon::Daemon; use dora_download::download_file; use dora_message::{ cli_to_coordinator::ControlRequest, + common::LogMessage, coordinator_to_cli::{ControlRequestReply, DataflowList, DataflowResult, DataflowStatus}, }; #[cfg(feature = "tracing")] @@ -21,7 +24,11 @@ use dora_tracing::{set_up_tracing_opts, FileLogging}; use duration_str::parse; use eyre::{bail, Context}; use formatting::FormatDataflowError; -use std::{env::current_dir, io::Write, net::SocketAddr}; +use std::{ + env::current_dir, + io::Write, + net::{SocketAddr, TcpStream}, +}; use std::{ net::{IpAddr, Ipv4Addr}, path::PathBuf, @@ -377,10 +384,17 @@ fn run(args: Args) -> eyre::Result<()> { coordinator_port, uv, } => { - let (_, _, _, mut session, uuid) = - start_dataflow(dataflow, None, 
coordinator_addr, coordinator_port, uv, true)?; + let coordinator_socket = (coordinator_addr, coordinator_port).into(); + let (_, _, mut session, uuid) = + start_dataflow(dataflow, None, coordinator_socket, uv, true)?; // wait until build is finished - wait_until_dataflow_started(uuid, &mut session, true)?; + wait_until_dataflow_started( + uuid, + &mut session, + true, + coordinator_socket, + log::LevelFilter::Info, + )?; } Command::New { args, @@ -413,7 +427,8 @@ fn run(args: Args) -> eyre::Result<()> { let name = if uuid.is_some() { None } else { Some(dataflow) }; logs::logs(&mut *session, uuid, name, node)? } else { - let active = list.get_active(); + let active: Vec = + list.get_active(); let uuid = match &active[..] { [] => bail!("No dataflows are running"), [uuid] => uuid.clone(), @@ -432,15 +447,9 @@ fn run(args: Args) -> eyre::Result<()> { hot_reload, uv, } => { - let (dataflow, dataflow_descriptor, coordinator_socket, mut session, dataflow_id) = - start_dataflow( - dataflow, - name, - coordinator_addr, - coordinator_port, - uv, - false, - )?; + let coordinator_socket = (coordinator_addr, coordinator_port).into(); + let (dataflow, dataflow_descriptor, mut session, dataflow_id) = + start_dataflow(dataflow, name, coordinator_socket, uv, false)?; let attach = match (attach, detach) { (true, true) => eyre::bail!("both `--attach` and `--detach` are given"), @@ -464,7 +473,13 @@ fn run(args: Args) -> eyre::Result<()> { )? } else { // wait until dataflow is started - wait_until_dataflow_started(dataflow_id, &mut session, false)?; + wait_until_dataflow_started( + dataflow_id, + &mut session, + false, + coordinator_socket, + log::LevelFilter::Info, + )?; } } Command::List { @@ -620,20 +635,10 @@ fn run(args: Args) -> eyre::Result<()> { fn start_dataflow( dataflow: String, name: Option, - coordinator_addr: IpAddr, - coordinator_port: u16, + coordinator_socket: SocketAddr, uv: bool, build_only: bool, -) -> Result< - ( - PathBuf, - Descriptor, - SocketAddr, - Box, - Uuid, - ), - eyre::Error, -> { +) -> Result<(PathBuf, Descriptor, Box, Uuid), eyre::Error> { let dataflow = resolve_dataflow(dataflow).context("could not resolve dataflow")?; let dataflow_descriptor = Descriptor::blocking_read(&dataflow).wrap_err("Failed to read yaml dataflow")?; @@ -643,7 +648,6 @@ fn start_dataflow( .parent() .ok_or_else(|| eyre::eyre!("dataflow path has no parent dir"))? 
.to_owned(); - let coordinator_socket = (coordinator_addr, coordinator_port).into(); let mut session = connect_to_coordinator(coordinator_socket) .wrap_err("failed to connect to dora coordinator")?; let dataflow_id = { @@ -677,20 +681,45 @@ fn start_dataflow( other => bail!("unexpected start dataflow reply: {other:?}"), } }; - Ok(( - dataflow, - dataflow_descriptor, - coordinator_socket, - session, - dataflow_id, - )) + Ok((dataflow, dataflow_descriptor, session, dataflow_id)) } fn wait_until_dataflow_started( dataflow_id: Uuid, session: &mut Box, build_only: bool, + coordinator_addr: SocketAddr, + log_level: log::LevelFilter, ) -> eyre::Result<()> { + // subscribe to log messages + let mut log_session = TcpConnection { + stream: TcpStream::connect(coordinator_addr) + .wrap_err("failed to connect to dora coordinator")?, + }; + log_session + .send( + &serde_json::to_vec(&ControlRequest::LogSubscribe { + dataflow_id, + level: log_level, + }) + .wrap_err("failed to serialize message")?, + ) + .wrap_err("failed to send log subscribe request to coordinator")?; + std::thread::spawn(move || { + while let Ok(raw) = log_session.receive() { + let parsed: eyre::Result = + serde_json::from_slice(&raw).context("failed to parse log message"); + match parsed { + Ok(log_message) => { + print_log_message(log_message); + } + Err(err) => { + tracing::warn!("failed to parse log message: {err:?}") + } + } + } + }); + let reply_raw = session .request(&serde_json::to_vec(&ControlRequest::WaitForSpawn { dataflow_id }).unwrap()) .wrap_err("failed to send start dataflow message")?; From e897f78024c864e35e1c2bc7d029bbc5366d27d8 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 29 Apr 2025 11:03:46 +0200 Subject: [PATCH 051/101] Add extra error cause when node fails to spawn for better error messages --- binaries/daemon/src/lib.rs | 14 +++++--------- libraries/message/src/common.rs | 5 +++++ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 7abe7ec5..9130b4cc 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -885,9 +885,7 @@ impl Daemon { node_id.clone(), Err(NodeError { timestamp: self.clock.new_timestamp(), - cause: NodeErrorCause::Other { - stderr: format!("spawn failed: {err:?}"), - }, + cause: NodeErrorCause::FailedToSpawn(format!("{err:?}")), exit_status: NodeExitStatus::Unknown, }), ); @@ -993,9 +991,9 @@ impl Daemon { } let node_err: NodeError = NodeError { timestamp: clock.new_timestamp(), - cause: NodeErrorCause::Other { - stderr: format!("preparing for spawn failed: {err:?}"), - }, + cause: NodeErrorCause::FailedToSpawn(format!( + "preparing for spawn failed: {err:?}" + )), exit_status: NodeExitStatus::Unknown, }; let send_result = events_tx.send(node_result(node_id, Err(node_err))).await; @@ -1047,9 +1045,7 @@ impl Daemon { Err(err) => { let node_err = NodeError { timestamp: clock.new_timestamp(), - cause: NodeErrorCause::Other { - stderr: format!("spawn failed: {err:?}"), - }, + cause: NodeErrorCause::FailedToSpawn(format!("spawn failed: {err:?}")), exit_status: NodeExitStatus::Unknown, }; if spawn_result.is_ok() { diff --git a/libraries/message/src/common.rs b/libraries/message/src/common.rs index 93e2f8d9..015b163e 100644 --- a/libraries/message/src/common.rs +++ b/libraries/message/src/common.rs @@ -32,6 +32,9 @@ pub struct NodeError { impl std::fmt::Display for NodeError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let NodeErrorCause::FailedToSpawn(err) = 
&self.cause { + return write!(f, "failed to spawn node: {err}"); + } match &self.exit_status { NodeExitStatus::Success => write!(f, ""), NodeExitStatus::IoError(err) => write!(f, "I/O error while reading exit status: {err}"), @@ -68,6 +71,7 @@ impl std::fmt::Display for NodeError { f, ". This error occurred because node `{caused_by_node}` exited before connecting to dora." )?, + NodeErrorCause::FailedToSpawn(_) => unreachable!(), // handled above NodeErrorCause::Other { stderr } if stderr.is_empty() => {} NodeErrorCause::Other { stderr } => { let line: &str = "---------------------------------------------------------------------------------\n"; @@ -88,6 +92,7 @@ pub enum NodeErrorCause { Cascading { caused_by_node: NodeId, }, + FailedToSpawn(String), Other { stderr: String, }, From 9f7c0959981be4f7031bd4b68e8c7bf31d1e8d55 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 29 Apr 2025 11:35:01 +0200 Subject: [PATCH 052/101] Don't error if dynamic node stops after dataflow is done We don't wait for dynamic nodes when the dataflow is done otherwise, so they might stop after the dataflow is done. This commit fixes a daemon error that happened in this case. --- binaries/daemon/src/lib.rs | 84 ++++++++++++++++++++++++-------- binaries/daemon/src/spawn/mod.rs | 6 +++ 2 files changed, 70 insertions(+), 20 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 9130b4cc..364ca6ae 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -106,6 +106,12 @@ pub struct Daemon { type DaemonRunResult = BTreeMap>>; +struct NodePrepareTask { + node_id: NodeId, + dynamic_node: bool, + task: F, +} + impl Daemon { pub async fn run( coordinator_addr: SocketAddr, @@ -393,6 +399,7 @@ impl Daemon { Event::SpawnNodeResult { dataflow_id, node_id, + dynamic_node, result, } => match result { Ok(running_node) => { @@ -407,7 +414,8 @@ impl Daemon { .entry(dataflow_id) .or_default() .insert(node_id.clone(), Err(error)); - self.handle_node_stop(dataflow_id, &node_id).await?; + self.handle_node_stop(dataflow_id, &node_id, dynamic_node) + .await?; } }, Event::SpawnDataflowResult { @@ -845,7 +853,8 @@ impl Daemon { let mut logger = logger.reborrow().for_node(node.id.clone()); let local = spawn_nodes.contains(&node.id); if local { - if node.kind.dynamic() { + let dynamic_node = node.kind.dynamic(); + if dynamic_node { dataflow.dynamic_nodes.insert(node.id.clone()); } else { dataflow.pending_nodes.insert(node.id.clone()); @@ -872,7 +881,11 @@ impl Daemon { .wrap_err_with(|| format!("failed to spawn node `{node_id}`")) { Ok(result) => { - tasks.push((node_id, result)); + tasks.push(NodePrepareTask { + node_id, + task: result, + dynamic_node, + }); } Err(err) => { logger @@ -889,7 +902,7 @@ impl Daemon { exit_status: NodeExitStatus::Unknown, }), ); - stopped.push(node_id.clone()); + stopped.push((node_id.clone(), dynamic_node)); } } } else { @@ -947,8 +960,9 @@ impl Daemon { } } } - for node_id in stopped { - self.handle_node_stop(dataflow_id, &node_id).await?; + for (node_id, dynamic) in stopped { + self.handle_node_stop(dataflow_id, &node_id, dynamic) + .await?; } let spawn_result = Self::spawn_prepared_nodes( @@ -965,24 +979,27 @@ impl Daemon { async fn spawn_prepared_nodes( dataflow_id: Uuid, mut logger: DataflowLogger<'_>, - tasks: Vec<( - NodeId, - impl Future>, - )>, + tasks: Vec>>>, events_tx: mpsc::Sender>, clock: Arc, ) -> eyre::Result<()> { - let node_result = |node_id, result| Timestamped { + let node_result = |node_id, dynamic_node, result| Timestamped { 
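+            // every spawn outcome is reported to the main daemon task as a
+            // timestamped `SpawnNodeResult` event; the `dynamic_node` flag lets
+            // the stop handler treat late exits of dynamic nodes gracefully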
inner: Event::SpawnNodeResult { dataflow_id, node_id, + dynamic_node, result, }, timestamp: clock.new_timestamp(), }; let mut failed_to_prepare = None; let mut prepared_nodes = Vec::new(); - for (node_id, task) in tasks { + for task in tasks { + let NodePrepareTask { + node_id, + dynamic_node, + task, + } = task; match task.await { Ok(node) => prepared_nodes.push(node), Err(err) => { @@ -996,7 +1013,9 @@ impl Daemon { )), exit_status: NodeExitStatus::Unknown, }; - let send_result = events_tx.send(node_result(node_id, Err(node_err))).await; + let send_result = events_tx + .send(node_result(node_id, dynamic_node, Err(node_err))) + .await; if send_result.is_err() { tracing::error!("failed to send SpawnNodeResult to main daemon task") } @@ -1016,7 +1035,11 @@ impl Daemon { exit_status: NodeExitStatus::Unknown, }; let send_result = events_tx - .send(node_result(node.node_id().clone(), Err(err))) + .send(node_result( + node.node_id().clone(), + node.dynamic(), + Err(err), + )) .await; if send_result.is_err() { tracing::error!("failed to send SpawnNodeResult to main daemon task") @@ -1038,6 +1061,7 @@ impl Daemon { // spawn the nodes for node in prepared_nodes { let node_id = node.node_id().clone(); + let dynamic_node = node.dynamic(); let mut logger = logger.reborrow().for_node(node_id.clone()); let result = node.spawn(&mut logger).await; let node_spawn_result = match result { @@ -1055,7 +1079,7 @@ impl Daemon { } }; let send_result = events_tx - .send(node_result(node_id, node_spawn_result)) + .send(node_result(node_id, dynamic_node, node_spawn_result)) .await; if send_result.is_err() { tracing::error!("failed to send SpawnNodeResult to main daemon task") @@ -1486,11 +1510,27 @@ impl Daemon { Ok(()) } - async fn handle_node_stop(&mut self, dataflow_id: Uuid, node_id: &NodeId) -> eyre::Result<()> { + async fn handle_node_stop( + &mut self, + dataflow_id: Uuid, + node_id: &NodeId, + dynamic_node: bool, + ) -> eyre::Result<()> { let mut logger = self.logger.for_dataflow(dataflow_id); - let dataflow = self.running.get_mut(&dataflow_id).wrap_err_with(|| { - format!("failed to get downstream nodes: no running dataflow with ID `{dataflow_id}`") - })?; + let dataflow = match self.running.get_mut(&dataflow_id) { + Some(dataflow) => dataflow, + None if dynamic_node => { + // The dataflow might be done already as we don't wait for dynamic nodes. In this + // case, we don't need to do anything to handle the node stop. 
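+                // (Dynamic nodes are registered in `dynamic_nodes` rather than
+                // `pending_nodes` when spawned, so the dataflow can finish and be
+                // cleaned up while they are still running; a late exit here is
+                // expected and only logged.)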
+ tracing::debug!( + "dynamic node {dataflow_id}/{node_id} stopped after dataflow was done" + ); + return Ok(()); + } + None => eyre::bail!( + "failed to get downstream nodes: no running dataflow with ID `{dataflow_id}`" + ), + }; dataflow .pending_nodes @@ -1652,6 +1692,7 @@ impl Daemon { DoraEvent::SpawnedNodeResult { dataflow_id, node_id, + dynamic_node, exit_status, } => { let mut logger = self @@ -1739,7 +1780,8 @@ impl Daemon { .or_default() .insert(node_id.clone(), node_result); - self.handle_node_stop(dataflow_id, &node_id).await?; + self.handle_node_stop(dataflow_id, &node_id, dynamic_node) + .await?; if let Some(exit_when_done) = &mut self.exit_when_done { exit_when_done.remove(&(dataflow_id, node_id)); @@ -2227,6 +2269,7 @@ pub enum Event { SpawnNodeResult { dataflow_id: DataflowId, node_id: NodeId, + dynamic_node: bool, result: Result, }, SpawnDataflowResult { @@ -2306,6 +2349,7 @@ pub enum DoraEvent { SpawnedNodeResult { dataflow_id: DataflowId, node_id: NodeId, + dynamic_node: bool, exit_status: NodeExitStatus, }, } diff --git a/binaries/daemon/src/spawn/mod.rs b/binaries/daemon/src/spawn/mod.rs index 1d1eedaa..9bf15360 100644 --- a/binaries/daemon/src/spawn/mod.rs +++ b/binaries/daemon/src/spawn/mod.rs @@ -403,6 +403,10 @@ impl PreparedNode { &self.node.id } + pub fn dynamic(&self) -> bool { + self.node.kind.dynamic() + } + pub async fn spawn(mut self, logger: &mut NodeLogger<'_>) -> eyre::Result { let mut child = match &mut self.command { Some(command) => command.spawn().wrap_err(self.spawn_error_msg)?, @@ -555,6 +559,7 @@ impl PreparedNode { }); let node_id = self.node.id.clone(); + let dynamic_node = self.node.kind.dynamic(); let (log_finish_tx, log_finish_rx) = oneshot::channel(); let clock = self.clock.clone(); let daemon_tx = self.daemon_tx.clone(); @@ -566,6 +571,7 @@ impl PreparedNode { dataflow_id, node_id, exit_status, + dynamic_node, } .into(); let event = Timestamped { From 51ed0194c7cfce7ad079b2c24a135bfd4ae0c29f Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 6 Jun 2025 18:05:09 +0200 Subject: [PATCH 053/101] Rework and refactor for two-step build --- .gitignore | 4 +- Cargo.lock | 234 +++++++----- Cargo.toml | 3 + apis/python/node/Cargo.toml | 2 +- apis/python/node/src/lib.rs | 15 +- binaries/cli/Cargo.toml | 5 +- binaries/cli/src/command/build/distributed.rs | 107 ++++++ binaries/cli/src/command/build/git.rs | 45 +++ binaries/cli/src/command/build/local.rs | 101 +++++ binaries/cli/src/command/build/mod.rs | 162 ++++++++ binaries/cli/src/{ => command}/check.rs | 0 binaries/cli/src/{ => command}/logs.rs | 0 binaries/cli/src/command/mod.rs | 60 +++ binaries/cli/src/command/run.rs | 23 ++ .../cli/src/{ => command/start}/attach.rs | 38 +- binaries/cli/src/command/start/mod.rs | 167 +++++++++ binaries/cli/src/{ => command}/up.rs | 2 +- binaries/cli/src/lib.rs | 234 ++---------- binaries/cli/src/output.rs | 48 +++ binaries/cli/src/session.rs | 73 ++++ binaries/coordinator/src/control.rs | 30 +- binaries/coordinator/src/lib.rs | 327 +++++++++++++--- binaries/coordinator/src/listener.rs | 10 + binaries/coordinator/src/run/mod.rs | 26 +- binaries/daemon/Cargo.toml | 3 +- binaries/daemon/src/lib.rs | 326 +++++++++++++--- binaries/daemon/src/log.rs | 88 ++++- binaries/daemon/src/spawn/git.rs | 286 -------------- binaries/daemon/src/spawn/mod.rs | 121 +----- examples/benchmark/run.rs | 14 + examples/c++-dataflow/.gitignore | 1 + examples/c++-ros2-dataflow/.gitignore | 1 + examples/multiple-daemons/run.rs | 22 +- examples/python-ros2-dataflow/run.rs | 9 + 
examples/rust-dataflow-git/.gitignore | 2 + examples/rust-dataflow-git/dataflow.yml | 6 +- examples/rust-dataflow-git/run.rs | 13 + examples/rust-dataflow-url/.gitignore | 1 + examples/rust-dataflow-url/run.rs | 13 + examples/rust-dataflow/run.rs | 13 + libraries/core/Cargo.toml | 5 +- .../src/{build.rs => build/build_command.rs} | 0 libraries/core/src/build/git.rs | 353 ++++++++++++++++++ libraries/core/src/build/logger.rs | 15 + libraries/core/src/build/mod.rs | 139 +++++++ libraries/core/src/git.rs | 0 libraries/core/src/lib.rs | 1 + libraries/message/src/cli_to_coordinator.rs | 41 +- libraries/message/src/common.rs | 11 +- libraries/message/src/coordinator_to_cli.rs | 35 +- .../message/src/coordinator_to_daemon.rs | 36 +- .../message/src/daemon_to_coordinator.rs | 9 +- libraries/message/src/descriptor.rs | 6 + libraries/message/src/lib.rs | 35 ++ 54 files changed, 2437 insertions(+), 884 deletions(-) create mode 100644 binaries/cli/src/command/build/distributed.rs create mode 100644 binaries/cli/src/command/build/git.rs create mode 100644 binaries/cli/src/command/build/local.rs create mode 100644 binaries/cli/src/command/build/mod.rs rename binaries/cli/src/{ => command}/check.rs (100%) rename binaries/cli/src/{ => command}/logs.rs (100%) create mode 100644 binaries/cli/src/command/mod.rs create mode 100644 binaries/cli/src/command/run.rs rename binaries/cli/src/{ => command/start}/attach.rs (86%) create mode 100644 binaries/cli/src/command/start/mod.rs rename binaries/cli/src/{ => command}/up.rs (98%) create mode 100644 binaries/cli/src/output.rs create mode 100644 binaries/cli/src/session.rs delete mode 100644 binaries/daemon/src/spawn/git.rs create mode 100644 examples/rust-dataflow-git/.gitignore create mode 100644 examples/rust-dataflow-url/.gitignore rename libraries/core/src/{build.rs => build/build_command.rs} (100%) create mode 100644 libraries/core/src/build/git.rs create mode 100644 libraries/core/src/build/logger.rs create mode 100644 libraries/core/src/build/mod.rs create mode 100644 libraries/core/src/git.rs diff --git a/.gitignore b/.gitignore index 2bab6ed3..ade7dc46 100644 --- a/.gitignore +++ b/.gitignore @@ -34,7 +34,7 @@ __pycache__/ # Distribution / packaging .Python -build/ +/build/ develop-eggs/ dist/ downloads/ @@ -179,4 +179,4 @@ out/ #Miscellaneous yolo.yml -~* \ No newline at end of file +~* diff --git a/Cargo.lock b/Cargo.lock index 919ab7e4..9067383e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -851,7 +851,7 @@ dependencies = [ "num-traits", "rusticata-macros", "thiserror 1.0.69", - "time 0.3.41", + "time", ] [[package]] @@ -1415,7 +1415,7 @@ dependencies = [ "path_abs", "plist", "regex", - "semver 1.0.26", + "semver", "serde", "serde_yaml 0.9.34+deprecated", "shell-words", @@ -1823,7 +1823,7 @@ checksum = "4acbb09d9ee8e23699b9634375c72795d095bf268439da88562cf9b501f181fa" dependencies = [ "camino", "cargo-platform", - "semver 1.0.26", + "semver", "serde", "serde_json", ] @@ -1836,7 +1836,7 @@ checksum = "2d886547e41f740c616ae73108f6eb70afe6d940c7bc697cb30f13daec073037" dependencies = [ "camino", "cargo-platform", - "semver 1.0.26", + "semver", "serde", "serde_json", "thiserror 1.0.69", @@ -2521,6 +2521,33 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96a6ac251f4a2aca6b3f91340350eab87ae57c3f127ffeb585e92bd336717991" +[[package]] +name = "curve25519-dalek" +version = "4.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" +dependencies = [ + "cfg-if 1.0.0", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "rustc_version", + "subtle", + "zeroize", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "cxx" version = "1.0.149" @@ -2955,10 +2982,12 @@ dependencies = [ "dora-operator-api-c", "dora-runtime", "dora-tracing", + "dunce", "duration-str", "env_logger 0.11.7", "eyre", "futures", + "git2", "inquire", "log", "notify 5.2.0", @@ -3004,6 +3033,8 @@ dependencies = [ "dora-message", "dunce", "eyre", + "git2", + "itertools 0.14.0", "log", "once_cell", "schemars", @@ -3013,6 +3044,7 @@ dependencies = [ "serde_yaml 0.9.34+deprecated", "tokio", "tracing", + "url", "uuid 1.16.0", "which", ] @@ -3039,6 +3071,7 @@ dependencies = [ "futures", "futures-concurrency", "git2", + "itertools 0.14.0", "serde_json", "serde_yaml 0.8.26", "shared-memory-server", @@ -3067,6 +3100,7 @@ dependencies = [ name = "dora-examples" version = "0.0.0" dependencies = [ + "dora-cli", "dora-coordinator", "dora-core", "dora-download", @@ -3108,7 +3142,7 @@ dependencies = [ "log", "once_cell", "schemars", - "semver 1.0.26", + "semver", "serde", "serde-with-expand-env", "serde_yaml 0.9.34+deprecated", @@ -3196,7 +3230,7 @@ name = "dora-node-api-python" version = "0.3.10" dependencies = [ "arrow 54.2.1", - "dora-daemon", + "dora-cli", "dora-download", "dora-node-api", "dora-operator-api-python", @@ -3455,7 +3489,7 @@ dependencies = [ "rust_decimal", "serde", "thiserror 1.0.69", - "time 0.3.41", + "time", ] [[package]] @@ -3492,6 +3526,31 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18aade80d5e09429040243ce1143ddc08a92d7a22820ac512610410a4dd5214f" +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "pkcs8 0.10.2", + "signature 2.2.0", +] + +[[package]] +name = "ed25519-dalek" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871" +dependencies = [ + "curve25519-dalek", + "ed25519", + "serde", + "sha2", + "signature 2.2.0", + "subtle", + "zeroize", +] + [[package]] name = "eframe" version = "0.31.1" @@ -4099,6 +4158,12 @@ dependencies = [ "anyhow", ] +[[package]] +name = "fiat-crypto" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" + [[package]] name = "filetime" version = "0.2.25" @@ -5073,7 +5138,7 @@ dependencies = [ "dirs 5.0.1", "futures", "http 1.3.1", - "indicatif 0.17.11", + "indicatif", "libc", "log", "num_cpus", @@ -5204,7 +5269,7 @@ dependencies = [ "http 0.2.12", "http-serde", "serde", - "time 0.3.41", + "time", ] [[package]] @@ -5602,18 +5667,6 @@ dependencies = [ "serde", ] -[[package]] -name = "indicatif" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4" -dependencies = [ - "console", - "lazy_static", - "number_prefix 0.3.0", - "regex", -] - 
[[package]] name = "indicatif" version = "0.17.11" @@ -5621,7 +5674,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" dependencies = [ "console", - "number_prefix 0.4.0", + "number_prefix", "portable-atomic", "rayon", "unicode-width 0.2.0", @@ -6730,7 +6783,7 @@ dependencies = [ "hf-hub", "image", "indexmap 2.8.0", - "indicatif 0.17.11", + "indicatif", "interprocess", "itertools 0.13.0", "llguidance", @@ -7367,12 +7420,6 @@ dependencies = [ "libc", ] -[[package]] -name = "number_prefix" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a" - [[package]] name = "number_prefix" version = "0.4.0" @@ -8382,7 +8429,7 @@ dependencies = [ "indexmap 2.8.0", "quick-xml 0.32.0", "serde", - "time 0.3.41", + "time", ] [[package]] @@ -8880,15 +8927,6 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" -[[package]] -name = "quick-xml" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26aab6b48e2590e4a64d1ed808749ba06257882b461d01ca71baeb747074a6dd" -dependencies = [ - "memchr", -] - [[package]] name = "quick-xml" version = "0.30.0" @@ -9238,7 +9276,7 @@ dependencies = [ "serde_json", "sha2", "thiserror 1.0.69", - "time 0.3.41", + "time", "url", "uuid 1.16.0", "web-sys", @@ -9325,7 +9363,7 @@ dependencies = [ "cargo_metadata 0.18.1", "glob", "sha2", - "time 0.3.41", + "time", "unindent", "walkdir", ] @@ -9760,7 +9798,7 @@ dependencies = [ "serde_bytes", "static_assertions", "thiserror 1.0.69", - "time 0.3.41", + "time", "typenum", "uuid 1.16.0", "web-time", @@ -10241,7 +10279,7 @@ dependencies = [ "strum 0.26.3", "strum_macros 0.26.4", "sublime_fuzzy", - "time 0.3.41", + "time", "url", ] @@ -11327,7 +11365,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ - "semver 1.0.26", + "semver", ] [[package]] @@ -11743,34 +11781,39 @@ dependencies = [ "libc", ] +[[package]] +name = "self-replace" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03ec815b5eab420ab893f63393878d89c90fdd94c0bcc44c07abb8ad95552fb7" +dependencies = [ + "fastrand 2.3.0", + "tempfile", + "windows-sys 0.52.0", +] + [[package]] name = "self_update" -version = "0.27.0" +version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb85f1802f7b987237b8525c0fde86ea86f31c957c1875467c727d5b921179c" +checksum = "d832c086ece0dacc29fb2947bb4219b8f6e12fe9e40b7108f9e57c4224e47b5c" dependencies = [ "either", "flate2", - "hyper 0.14.32", - "indicatif 0.15.0", + "hyper 1.6.0", + "indicatif", "log", - "quick-xml 0.20.0", + "quick-xml 0.37.2", "regex", - "reqwest 0.11.27", - "semver 0.11.0", + "reqwest 0.12.15", + "self-replace", + "semver", "serde_json", "tar", "tempfile", - "zip 0.5.13", -] - -[[package]] -name = "semver" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" -dependencies = [ - "semver-parser", + "urlencoding", + "zip 2.4.2", + "zipsign-api", ] [[package]] @@ -11782,15 +11825,6 @@ dependencies = [ "serde", ] -[[package]] -name = "semver-parser" -version 
= "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9900206b54a3527fdc7b8a938bffd94a568bac4f4aa8113b209df75a09c0dec2" -dependencies = [ - "pest", -] - [[package]] name = "seq-macro" version = "0.3.6" @@ -11957,7 +11991,7 @@ dependencies = [ "serde_derive", "serde_json", "serde_with_macros", - "time 0.3.41", + "time", ] [[package]] @@ -13015,17 +13049,6 @@ dependencies = [ "weezl", ] -[[package]] -name = "time" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi 0.3.9", -] - [[package]] name = "time" version = "0.3.41" @@ -13188,7 +13211,7 @@ dependencies = [ "derive_builder", "esaxx-rs", "getrandom 0.2.15", - "indicatif 0.17.11", + "indicatif", "itertools 0.13.0", "lazy_static", "log", @@ -14074,12 +14097,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -15367,7 +15384,7 @@ dependencies = [ "oid-registry", "rusticata-macros", "thiserror 1.0.69", - "time 0.3.41", + "time", ] [[package]] @@ -15979,7 +15996,7 @@ dependencies = [ "rustls 0.23.25", "rustls-webpki 0.102.8", "serde", - "time 0.3.41", + "time", "tokio", "tokio-util", "tracing", @@ -16030,7 +16047,7 @@ dependencies = [ "rustls-pki-types", "rustls-webpki 0.102.8", "secrecy", - "time 0.3.41", + "time", "tokio", "tokio-util", "tracing", @@ -16115,7 +16132,7 @@ dependencies = [ "rustls-webpki 0.102.8", "secrecy", "socket2 0.5.8", - "time 0.3.41", + "time", "tls-listener", "tokio", "tokio-rustls 0.26.2", @@ -16601,29 +16618,44 @@ dependencies = [ [[package]] name = "zip" -version = "0.5.13" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93ab48844d61251bb3835145c521d88aa4031d7139e8485990f60ca911fa0815" +checksum = "9cc23c04387f4da0374be4533ad1208cbb091d5c11d070dfef13676ad6497164" dependencies = [ - "byteorder", + "arbitrary", "crc32fast", + "crossbeam-utils", + "displaydoc", + "indexmap 2.8.0", + "num_enum", "thiserror 1.0.69", - "time 0.1.45", ] [[package]] name = "zip" -version = "1.1.4" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cc23c04387f4da0374be4533ad1208cbb091d5c11d070dfef13676ad6497164" +checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" dependencies = [ "arbitrary", "crc32fast", "crossbeam-utils", "displaydoc", "indexmap 2.8.0", - "num_enum", - "thiserror 1.0.69", + "memchr", + "thiserror 2.0.12", + "time", +] + +[[package]] +name = "zipsign-api" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dba6063ff82cdbd9a765add16d369abe81e520f836054e997c2db217ceca40c0" +dependencies = [ + "base64 0.22.1", + "ed25519-dalek", + "thiserror 2.0.12", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index a353bf17..d2489a52 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,6 +69,7 @@ dora-metrics = { version = "0.3.10", path = "libraries/extensions/telemetry/metr dora-download = { version = "0.3.10", path = "libraries/extensions/download" } shared-memory-server = { version = "0.3.10", path = "libraries/shared-memory-server" } communication-layer-request-reply = { version 
= "0.3.10", path = "libraries/communication-layer/request-reply" } +dora-cli = { version = "0.3.10", path = "binaries/cli" } dora-runtime = { version = "0.3.10", path = "binaries/runtime" } dora-daemon = { version = "0.3.10", path = "binaries/daemon" } dora-coordinator = { version = "0.3.10", path = "binaries/coordinator" } @@ -88,6 +89,7 @@ pyo3 = { version = "0.23", features = [ "multiple-pymethods", ] } pythonize = "0.23" +git2 = { version = "0.18.0", features = ["vendored-openssl"] } [package] name = "dora-examples" @@ -104,6 +106,7 @@ ros2-examples = [] [dev-dependencies] eyre = "0.6.8" tokio = "1.24.2" +dora-cli = { workspace = true } dora-coordinator = { workspace = true } dora-core = { workspace = true } dora-message = { workspace = true } diff --git a/apis/python/node/Cargo.toml b/apis/python/node/Cargo.toml index 54ebff5c..c06fbbaa 100644 --- a/apis/python/node/Cargo.toml +++ b/apis/python/node/Cargo.toml @@ -24,7 +24,7 @@ eyre = "0.6" serde_yaml = "0.8.23" flume = "0.10.14" dora-runtime = { workspace = true, features = ["tracing", "metrics", "python"] } -dora-daemon = { workspace = true } +dora-cli = { workspace = true } dora-download = { workspace = true } arrow = { workspace = true, features = ["pyarrow"] } pythonize = { workspace = true } diff --git a/apis/python/node/src/lib.rs b/apis/python/node/src/lib.rs index e2a249a9..2d3634cd 100644 --- a/apis/python/node/src/lib.rs +++ b/apis/python/node/src/lib.rs @@ -6,7 +6,6 @@ use std::sync::Arc; use std::time::Duration; use arrow::pyarrow::{FromPyArrow, ToPyArrow}; -use dora_daemon::Daemon; use dora_download::download_file; use dora_node_api::dora_core::config::NodeId; use dora_node_api::dora_core::descriptor::source_is_url; @@ -382,19 +381,7 @@ pub fn resolve_dataflow(dataflow: String) -> eyre::Result { #[pyfunction] #[pyo3(signature = (dataflow_path, uv=None))] pub fn run(dataflow_path: String, uv: Option) -> eyre::Result<()> { - let dataflow_path = resolve_dataflow(dataflow_path).context("could not resolve dataflow")?; - let rt = tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build() - .context("tokio runtime failed")?; - let result = rt.block_on(Daemon::run_dataflow(&dataflow_path, uv.unwrap_or_default()))?; - match result.is_ok() { - true => Ok(()), - false => Err(eyre::eyre!( - "Dataflow failed to run with error: {:?}", - result.node_results - )), - } + dora_cli::command::run(dataflow_path, uv.unwrap_or_default()) } #[pymodule] diff --git a/binaries/cli/Cargo.toml b/binaries/cli/Cargo.toml index 20d42015..96b67a76 100644 --- a/binaries/cli/Cargo.toml +++ b/binaries/cli/Cargo.toml @@ -50,7 +50,7 @@ tabwriter = "1.4.0" log = { version = "0.4.21", features = ["serde"] } colored = "2.1.0" env_logger = "0.11.3" -self_update = { version = "0.27.0", features = [ +self_update = { version = "0.42.0", features = [ "rustls", "archive-zip", "archive-tar", @@ -60,7 +60,8 @@ pyo3 = { workspace = true, features = [ "extension-module", "abi3", ], optional = true } - +dunce = "1.0.5" +git2 = { workspace = true } [lib] name = "dora_cli" diff --git a/binaries/cli/src/command/build/distributed.rs b/binaries/cli/src/command/build/distributed.rs new file mode 100644 index 00000000..9e7fca67 --- /dev/null +++ b/binaries/cli/src/command/build/distributed.rs @@ -0,0 +1,107 @@ +use communication_layer_request_reply::{TcpConnection, TcpRequestReplyConnection}; +use dora_core::descriptor::Descriptor; +use dora_message::{ + cli_to_coordinator::ControlRequest, + common::{GitSource, LogMessage}, + coordinator_to_cli::ControlRequestReply, 
+ id::NodeId, + BuildId, +}; +use eyre::{bail, Context}; +use std::{ + collections::BTreeMap, + net::{SocketAddr, TcpStream}, +}; + +use crate::{output::print_log_message, session::DataflowSession}; + +pub fn build_distributed_dataflow( + session: &mut TcpRequestReplyConnection, + dataflow: Descriptor, + git_sources: &BTreeMap, + dataflow_session: &DataflowSession, + local_working_dir: Option, + uv: bool, +) -> eyre::Result { + let build_id = { + let reply_raw = session + .request( + &serde_json::to_vec(&ControlRequest::Build { + session_id: dataflow_session.session_id, + dataflow, + git_sources: git_sources.clone(), + prev_git_sources: dataflow_session.git_sources.clone(), + local_working_dir, + uv, + }) + .unwrap(), + ) + .wrap_err("failed to send start dataflow message")?; + + let result: ControlRequestReply = + serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; + match result { + ControlRequestReply::DataflowBuildTriggered { build_id } => { + eprintln!("dataflow build triggered: {build_id}"); + build_id + } + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), + } + }; + Ok(build_id) +} + +pub fn wait_until_dataflow_built( + build_id: BuildId, + session: &mut TcpRequestReplyConnection, + coordinator_socket: SocketAddr, + log_level: log::LevelFilter, +) -> eyre::Result { + // subscribe to log messages + let mut log_session = TcpConnection { + stream: TcpStream::connect(coordinator_socket) + .wrap_err("failed to connect to dora coordinator")?, + }; + log_session + .send( + &serde_json::to_vec(&ControlRequest::BuildLogSubscribe { + build_id, + level: log_level, + }) + .wrap_err("failed to serialize message")?, + ) + .wrap_err("failed to send build log subscribe request to coordinator")?; + std::thread::spawn(move || { + while let Ok(raw) = log_session.receive() { + let parsed: eyre::Result = + serde_json::from_slice(&raw).context("failed to parse log message"); + match parsed { + Ok(log_message) => { + print_log_message(log_message); + } + Err(err) => { + tracing::warn!("failed to parse log message: {err:?}") + } + } + } + }); + + let reply_raw = session + .request(&serde_json::to_vec(&ControlRequest::WaitForBuild { build_id }).unwrap()) + .wrap_err("failed to send WaitForBuild message")?; + + let result: ControlRequestReply = + serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; + match result { + ControlRequestReply::DataflowBuildFinished { build_id, result } => match result { + Ok(()) => { + eprintln!("dataflow build finished successfully"); + Ok(build_id) + } + Err(err) => bail!("{err}"), + }, + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), + } +} diff --git a/binaries/cli/src/command/build/git.rs b/binaries/cli/src/command/build/git.rs new file mode 100644 index 00000000..18faba87 --- /dev/null +++ b/binaries/cli/src/command/build/git.rs @@ -0,0 +1,45 @@ +use dora_message::{common::GitSource, descriptor::GitRepoRev}; +use eyre::Context; + +pub fn fetch_commit_hash(repo_url: String, rev: Option) -> eyre::Result { + let mut remote = git2::Remote::create_detached(repo_url.as_bytes()) + .with_context(|| format!("failed to create git remote for {repo_url}"))?; + let connection = remote + .connect_auth(git2::Direction::Fetch, None, None) + .with_context(|| format!("failed to open connection to {repo_url}"))?; + let references = connection + .list() + .with_context(|| format!("failed to list git references of 
{repo_url}"))?; + + let expected_name = match &rev { + Some(GitRepoRev::Branch(branch)) => format!("refs/heads/{branch}"), + Some(GitRepoRev::Tag(tag)) => format!("refs/tags/{tag}"), + Some(GitRepoRev::Rev(rev)) => rev.clone(), + None => "HEAD".into(), + }; + + let mut commit_hash = None; + for head in references { + if head.name() == expected_name { + commit_hash = Some(head.oid().to_string()); + break; + } + } + + if commit_hash.is_none() { + if let Some(GitRepoRev::Rev(rev)) = &rev { + // rev might be a commit hash instead of a reference + if rev.is_ascii() && rev.bytes().all(|b| b.is_ascii_alphanumeric()) { + commit_hash = Some(rev.clone()); + } + } + } + + match commit_hash { + Some(commit_hash) => Ok(GitSource { + repo: repo_url, + commit_hash, + }), + None => eyre::bail!("no matching commit for `{rev:?}`"), + } +} diff --git a/binaries/cli/src/command/build/local.rs b/binaries/cli/src/command/build/local.rs new file mode 100644 index 00000000..ac28eeca --- /dev/null +++ b/binaries/cli/src/command/build/local.rs @@ -0,0 +1,101 @@ +use std::{collections::BTreeMap, path::PathBuf}; + +use dora_core::{ + build::{BuildInfo, BuildLogger, Builder, GitManager}, + descriptor::{Descriptor, DescriptorExt}, +}; +use dora_message::{common::GitSource, id::NodeId}; +use eyre::Context; + +use crate::session::DataflowSession; + +pub fn build_dataflow_locally( + dataflow: Descriptor, + git_sources: &BTreeMap, + dataflow_session: &DataflowSession, + working_dir: PathBuf, + uv: bool, +) -> eyre::Result { + let runtime = tokio::runtime::Runtime::new()?; + + runtime.block_on(build_dataflow( + dataflow, + git_sources, + dataflow_session, + working_dir, + uv, + )) +} + +async fn build_dataflow( + dataflow: Descriptor, + git_sources: &BTreeMap, + dataflow_session: &DataflowSession, + base_working_dir: PathBuf, + uv: bool, +) -> eyre::Result { + let builder = Builder { + session_id: dataflow_session.session_id, + base_working_dir, + uv, + }; + let nodes = dataflow.resolve_aliases_and_set_defaults()?; + + let mut git_manager = GitManager::default(); + let prev_git_sources = &dataflow_session.git_sources; + + let mut tasks = Vec::new(); + + // build nodes + for node in nodes.into_values() { + let node_id = node.id.clone(); + let git_source = git_sources.get(&node_id).cloned(); + let prev_git_source = prev_git_sources.get(&node_id).cloned(); + + let task = builder + .clone() + .build_node( + node, + git_source, + prev_git_source, + LocalBuildLogger { + node_id: node_id.clone(), + }, + &mut git_manager, + ) + .await + .wrap_err_with(|| format!("failed to build node `{node_id}`"))?; + tasks.push((node_id, task)); + } + + let mut info = BuildInfo { + node_working_dirs: Default::default(), + }; + for (node_id, task) in tasks { + let node = task + .await + .with_context(|| format!("failed to build node `{node_id}`"))?; + info.node_working_dirs + .insert(node_id, node.node_working_dir); + } + Ok(info) +} + +struct LocalBuildLogger { + node_id: NodeId, +} + +impl BuildLogger for LocalBuildLogger { + type Clone = Self; + + async fn log_message(&mut self, level: log::Level, message: impl Into + Send) { + let message: String = message.into(); + println!("{}: \t{level}: \t{message}", self.node_id); + } + + async fn try_clone(&self) -> eyre::Result { + Ok(LocalBuildLogger { + node_id: self.node_id.clone(), + }) + } +} diff --git a/binaries/cli/src/command/build/mod.rs b/binaries/cli/src/command/build/mod.rs new file mode 100644 index 00000000..fff1d452 --- /dev/null +++ b/binaries/cli/src/command/build/mod.rs @@ -0,0 +1,162 
@@ +use communication_layer_request_reply::TcpRequestReplyConnection; +use dora_core::{ + descriptor::{CoreNodeKind, CustomNode, Descriptor, DescriptorExt}, + topics::{DORA_COORDINATOR_PORT_CONTROL_DEFAULT, LOCALHOST}, +}; +use dora_message::{descriptor::NodeSource, BuildId}; +use eyre::Context; +use std::collections::BTreeMap; + +use crate::{connect_to_coordinator, resolve_dataflow, session::DataflowSession}; + +use distributed::{build_distributed_dataflow, wait_until_dataflow_built}; +use local::build_dataflow_locally; + +mod distributed; +mod git; +mod local; + +pub fn build( + dataflow: String, + coordinator_addr: Option, + coordinator_port: Option, + uv: bool, + force_local: bool, +) -> eyre::Result<()> { + let dataflow_path = resolve_dataflow(dataflow).context("could not resolve dataflow")?; + let dataflow_descriptor = + Descriptor::blocking_read(&dataflow_path).wrap_err("Failed to read yaml dataflow")?; + let mut dataflow_session = + DataflowSession::read_session(&dataflow_path).context("failed to read DataflowSession")?; + + let mut git_sources = BTreeMap::new(); + let resolved_nodes = dataflow_descriptor + .resolve_aliases_and_set_defaults() + .context("failed to resolve nodes")?; + for (node_id, node) in resolved_nodes { + if let CoreNodeKind::Custom(CustomNode { + source: NodeSource::GitBranch { repo, rev }, + .. + }) = node.kind + { + let source = git::fetch_commit_hash(repo, rev) + .with_context(|| format!("failed to find commit hash for `{node_id}`"))?; + git_sources.insert(node_id, source); + } + } + + let session = connect_to_coordinator_with_defaults(coordinator_addr, coordinator_port); + + let build_kind = if force_local { + // user explicitly requested a local build + BuildKind::Local + } else if coordinator_addr.is_some() || coordinator_port.is_some() { + // explicit coordinator address or port set -> there should be a coordinator running + BuildKind::ThroughCoordinator { + coordinator_session: session.context("failed to connect to coordinator")?, + } + } else { + match session { + Ok(coordinator_session) => { + // we found a local coordinator instance at default port -> use it for building + BuildKind::ThroughCoordinator { + coordinator_session, + } + } + Err(_) => { + // no coordinator instance found -> do a local build + BuildKind::Local + } + } + }; + + match build_kind { + BuildKind::Local => { + println!("running local build"); + // use dataflow dir as base working dir + let local_working_dir = dunce::canonicalize(&dataflow_path) + .context("failed to canonicalize dataflow path")? + .parent() + .ok_or_else(|| eyre::eyre!("dataflow path has no parent dir"))? 
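+                // `dunce::canonicalize` works like `std::fs::canonicalize`, but on
+                // Windows it yields a plain path instead of a `\\?\`-prefixed UNC
+                // path, which some build tools cannot handle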
+ .to_owned(); + let build_info = build_dataflow_locally( + dataflow_descriptor, + &git_sources, + &dataflow_session, + local_working_dir, + uv, + )?; + + dataflow_session.git_sources = git_sources; + // generate a random BuildId and store the associated build info + dataflow_session.build_id = Some(BuildId::generate()); + dataflow_session.local_build = Some(build_info); + dataflow_session + .write_out_for_dataflow(&dataflow_path) + .context("failed to write out dataflow session file")?; + } + BuildKind::ThroughCoordinator { + mut coordinator_session, + } => { + let local_working_dir = super::local_working_dir( + &dataflow_path, + &dataflow_descriptor, + &mut *coordinator_session, + )?; + let build_id = build_distributed_dataflow( + &mut *coordinator_session, + dataflow_descriptor, + &git_sources, + &dataflow_session, + local_working_dir, + uv, + )?; + + dataflow_session.git_sources = git_sources; + dataflow_session + .write_out_for_dataflow(&dataflow_path) + .context("failed to write out dataflow session file")?; + + // wait until dataflow build is finished + + wait_until_dataflow_built( + build_id, + &mut *coordinator_session, + coordinator_socket(coordinator_addr, coordinator_port), + log::LevelFilter::Info, + )?; + + dataflow_session.build_id = Some(build_id); + dataflow_session.local_build = None; + dataflow_session + .write_out_for_dataflow(&dataflow_path) + .context("failed to write out dataflow session file")?; + } + }; + + Ok(()) +} + +enum BuildKind { + Local, + ThroughCoordinator { + coordinator_session: Box, + }, +} + +fn connect_to_coordinator_with_defaults( + coordinator_addr: Option, + coordinator_port: Option, +) -> std::io::Result> { + let coordinator_socket = coordinator_socket(coordinator_addr, coordinator_port); + connect_to_coordinator(coordinator_socket) +} + +fn coordinator_socket( + coordinator_addr: Option, + coordinator_port: Option, +) -> std::net::SocketAddr { + let coordinator_addr = coordinator_addr.unwrap_or(LOCALHOST); + let coordinator_port = coordinator_port.unwrap_or(DORA_COORDINATOR_PORT_CONTROL_DEFAULT); + (coordinator_addr, coordinator_port).into() +} diff --git a/binaries/cli/src/check.rs b/binaries/cli/src/command/check.rs similarity index 100% rename from binaries/cli/src/check.rs rename to binaries/cli/src/command/check.rs diff --git a/binaries/cli/src/logs.rs b/binaries/cli/src/command/logs.rs similarity index 100% rename from binaries/cli/src/logs.rs rename to binaries/cli/src/command/logs.rs diff --git a/binaries/cli/src/command/mod.rs b/binaries/cli/src/command/mod.rs new file mode 100644 index 00000000..77654440 --- /dev/null +++ b/binaries/cli/src/command/mod.rs @@ -0,0 +1,60 @@ +pub use build::build; +pub use logs::logs; +pub use run::run; +pub use start::start; + +use std::path::{Path, PathBuf}; + +use communication_layer_request_reply::TcpRequestReplyConnection; +use dora_core::descriptor::Descriptor; +use dora_message::{cli_to_coordinator::ControlRequest, coordinator_to_cli::ControlRequestReply}; +use eyre::{bail, Context, ContextCompat}; + +mod build; +pub mod check; +mod logs; +mod run; +mod start; +pub mod up; + +fn local_working_dir( + dataflow_path: &Path, + dataflow_descriptor: &Descriptor, + coordinator_session: &mut TcpRequestReplyConnection, +) -> eyre::Result> { + Ok( + if dataflow_descriptor + .nodes + .iter() + .all(|n| n.deploy.machine.is_none()) + && cli_and_daemon_on_same_machine(coordinator_session)? + { + Some( + dunce::canonicalize(dataflow_path) + .context("failed to canonicalize dataflow file path")? 
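+                    // a canonical local path is only passed along because the checks
+                    // above ensured that no node is pinned to a remote machine and
+                    // that the CLI runs on the same machine as the default daemon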
+ .parent() + .context("dataflow path has no parent dir")? + .to_owned(), + ) + } else { + None + }, + ) +} + +fn cli_and_daemon_on_same_machine(session: &mut TcpRequestReplyConnection) -> eyre::Result { + let reply_raw = session + .request(&serde_json::to_vec(&ControlRequest::CliAndDefaultDaemonOnSameMachine).unwrap()) + .wrap_err("failed to send start dataflow message")?; + + let result: ControlRequestReply = + serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; + match result { + ControlRequestReply::CliAndDefaultDaemonIps { + default_daemon, + cli, + } => Ok(default_daemon.is_some() && default_daemon == cli), + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), + } +} diff --git a/binaries/cli/src/command/run.rs b/binaries/cli/src/command/run.rs new file mode 100644 index 00000000..df01d16e --- /dev/null +++ b/binaries/cli/src/command/run.rs @@ -0,0 +1,23 @@ +use dora_daemon::Daemon; +use eyre::Context; +use tokio::runtime::Builder; + +use crate::{handle_dataflow_result, resolve_dataflow, session::DataflowSession}; + +pub fn run(dataflow: String, uv: bool) -> Result<(), eyre::Error> { + let dataflow_path = resolve_dataflow(dataflow).context("could not resolve dataflow")?; + let dataflow_session = + DataflowSession::read_session(&dataflow_path).context("failed to read DataflowSession")?; + let rt = Builder::new_multi_thread() + .enable_all() + .build() + .context("tokio runtime failed")?; + let result = rt.block_on(Daemon::run_dataflow( + &dataflow_path, + dataflow_session.build_id, + dataflow_session.local_build, + dataflow_session.session_id, + uv, + ))?; + handle_dataflow_result(result, None) +} diff --git a/binaries/cli/src/attach.rs b/binaries/cli/src/command/start/attach.rs similarity index 86% rename from binaries/cli/src/attach.rs rename to binaries/cli/src/command/start/attach.rs index e40be1d8..05d776e0 100644 --- a/binaries/cli/src/attach.rs +++ b/binaries/cli/src/command/start/attach.rs @@ -1,4 +1,3 @@ -use colored::Colorize; use communication_layer_request_reply::{TcpConnection, TcpRequestReplyConnection}; use dora_core::descriptor::{resolve_path, CoreNodeKind, Descriptor, DescriptorExt}; use dora_message::cli_to_coordinator::ControlRequest; @@ -16,6 +15,7 @@ use tracing::{error, info}; use uuid::Uuid; use crate::handle_dataflow_result; +use crate::output::print_log_message; pub fn attach_dataflow( dataflow: Descriptor, @@ -183,42 +183,6 @@ pub fn attach_dataflow( } } -pub fn print_log_message(log_message: LogMessage) { - let LogMessage { - dataflow_id, - node_id, - daemon_id, - level, - target, - module_path: _, - file: _, - line: _, - message, - } = log_message; - let level = match level { - log::Level::Error => "ERROR".red(), - log::Level::Warn => "WARN ".yellow(), - log::Level::Info => "INFO ".green(), - other => format!("{other:5}").normal(), - }; - let dataflow = format!(" dataflow `{dataflow_id}`").cyan(); - let daemon = match daemon_id { - Some(id) => format!(" on daemon `{id}`"), - None => " on default daemon".to_string(), - } - .bright_black(); - let node = match node_id { - Some(node_id) => format!(" {node_id}").bold(), - None => "".normal(), - }; - let target = match target { - Some(target) => format!(" {target}").dimmed(), - None => "".normal(), - }; - - println!("{level}{dataflow}{daemon}{node}{target}: {message}"); -} - enum AttachEvent { Control(ControlRequest), Log(eyre::Result), diff --git a/binaries/cli/src/command/start/mod.rs b/binaries/cli/src/command/start/mod.rs new 
file mode 100644 index 00000000..5275a62d --- /dev/null +++ b/binaries/cli/src/command/start/mod.rs @@ -0,0 +1,167 @@ +use communication_layer_request_reply::{TcpConnection, TcpRequestReplyConnection}; +use dora_core::descriptor::{Descriptor, DescriptorExt}; +use dora_message::{ + cli_to_coordinator::ControlRequest, common::LogMessage, coordinator_to_cli::ControlRequestReply, +}; +use eyre::{bail, Context}; +use std::{ + net::{SocketAddr, TcpStream}, + path::PathBuf, +}; +use uuid::Uuid; + +use crate::{ + connect_to_coordinator, output::print_log_message, resolve_dataflow, session::DataflowSession, +}; +use attach::attach_dataflow; + +mod attach; + +pub fn start( + dataflow: String, + name: Option, + coordinator_socket: SocketAddr, + attach: bool, + detach: bool, + hot_reload: bool, + uv: bool, +) -> eyre::Result<()> { + let (dataflow, dataflow_descriptor, mut session, dataflow_id) = + start_dataflow(dataflow, name, coordinator_socket, uv)?; + + let attach = match (attach, detach) { + (true, true) => eyre::bail!("both `--attach` and `--detach` are given"), + (true, false) => true, + (false, true) => false, + (false, false) => { + println!("attaching to dataflow (use `--detach` to run in background)"); + true + } + }; + + if attach { + let log_level = env_logger::Builder::new() + .filter_level(log::LevelFilter::Info) + .parse_default_env() + .build() + .filter(); + + attach_dataflow( + dataflow_descriptor, + dataflow, + dataflow_id, + &mut *session, + hot_reload, + coordinator_socket, + log_level, + ) + } else { + // wait until dataflow is started + wait_until_dataflow_started( + dataflow_id, + &mut session, + coordinator_socket, + log::LevelFilter::Info, + ) + } +} + +fn start_dataflow( + dataflow: String, + name: Option, + coordinator_socket: SocketAddr, + uv: bool, +) -> Result<(PathBuf, Descriptor, Box, Uuid), eyre::Error> { + let dataflow = resolve_dataflow(dataflow).context("could not resolve dataflow")?; + let dataflow_descriptor = + Descriptor::blocking_read(&dataflow).wrap_err("Failed to read yaml dataflow")?; + let dataflow_session = + DataflowSession::read_session(&dataflow).context("failed to read DataflowSession")?; + + let mut session = connect_to_coordinator(coordinator_socket) + .wrap_err("failed to connect to dora coordinator")?; + + let local_working_dir = + super::local_working_dir(&dataflow, &dataflow_descriptor, &mut *session)?; + + let dataflow_id = { + let dataflow = dataflow_descriptor.clone(); + let session: &mut TcpRequestReplyConnection = &mut *session; + let reply_raw = session + .request( + &serde_json::to_vec(&ControlRequest::Start { + build_id: dataflow_session.build_id, + session_id: dataflow_session.session_id, + dataflow, + name, + local_working_dir, + uv, + }) + .unwrap(), + ) + .wrap_err("failed to send start dataflow message")?; + + let result: ControlRequestReply = + serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; + match result { + ControlRequestReply::DataflowStartTriggered { uuid } => { + eprintln!("dataflow start triggered: {uuid}"); + uuid + } + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), + } + }; + Ok((dataflow, dataflow_descriptor, session, dataflow_id)) +} + +fn wait_until_dataflow_started( + dataflow_id: Uuid, + session: &mut Box, + coordinator_addr: SocketAddr, + log_level: log::LevelFilter, +) -> eyre::Result<()> { + // subscribe to log messages + let mut log_session = TcpConnection { + stream: TcpStream::connect(coordinator_addr) + .wrap_err("failed 
to connect to dora coordinator")?, + }; + log_session + .send( + &serde_json::to_vec(&ControlRequest::LogSubscribe { + dataflow_id, + level: log_level, + }) + .wrap_err("failed to serialize message")?, + ) + .wrap_err("failed to send log subscribe request to coordinator")?; + std::thread::spawn(move || { + while let Ok(raw) = log_session.receive() { + let parsed: eyre::Result = + serde_json::from_slice(&raw).context("failed to parse log message"); + match parsed { + Ok(log_message) => { + print_log_message(log_message); + } + Err(err) => { + tracing::warn!("failed to parse log message: {err:?}") + } + } + } + }); + + let reply_raw = session + .request(&serde_json::to_vec(&ControlRequest::WaitForSpawn { dataflow_id }).unwrap()) + .wrap_err("failed to send start dataflow message")?; + + let result: ControlRequestReply = + serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; + match result { + ControlRequestReply::DataflowSpawned { uuid } => { + eprintln!("dataflow started: {uuid}"); + } + ControlRequestReply::Error(err) => bail!("{err}"), + other => bail!("unexpected start dataflow reply: {other:?}"), + } + Ok(()) +} diff --git a/binaries/cli/src/up.rs b/binaries/cli/src/command/up.rs similarity index 98% rename from binaries/cli/src/up.rs rename to binaries/cli/src/command/up.rs index 16f1a4c1..03eead4a 100644 --- a/binaries/cli/src/up.rs +++ b/binaries/cli/src/command/up.rs @@ -1,4 +1,4 @@ -use crate::{check::daemon_running, connect_to_coordinator, LOCALHOST}; +use crate::{command::check::daemon_running, connect_to_coordinator, LOCALHOST}; use dora_core::topics::DORA_COORDINATOR_PORT_CONTROL_DEFAULT; use dora_message::{cli_to_coordinator::ControlRequest, coordinator_to_cli::ControlRequestReply}; use eyre::{bail, Context, ContextCompat}; diff --git a/binaries/cli/src/lib.rs b/binaries/cli/src/lib.rs index 1bbc6cc0..aeaca232 100644 --- a/binaries/cli/src/lib.rs +++ b/binaries/cli/src/lib.rs @@ -1,8 +1,5 @@ -use attach::{attach_dataflow, print_log_message}; use colored::Colorize; -use communication_layer_request_reply::{ - RequestReplyLayer, TcpConnection, TcpLayer, TcpRequestReplyConnection, -}; +use communication_layer_request_reply::{RequestReplyLayer, TcpLayer, TcpRequestReplyConnection}; use dora_coordinator::Event; use dora_core::{ descriptor::{source_is_url, Descriptor, DescriptorExt}, @@ -15,7 +12,6 @@ use dora_daemon::Daemon; use dora_download::download_file; use dora_message::{ cli_to_coordinator::ControlRequest, - common::LogMessage, coordinator_to_cli::{ControlRequestReply, DataflowList, DataflowResult, DataflowStatus}, }; #[cfg(feature = "tracing")] @@ -24,11 +20,7 @@ use dora_tracing::{set_up_tracing_opts, FileLogging}; use duration_str::parse; use eyre::{bail, Context}; use formatting::FormatDataflowError; -use std::{ - env::current_dir, - io::Write, - net::{SocketAddr, TcpStream}, -}; +use std::{env::current_dir, io::Write, net::SocketAddr}; use std::{ net::{IpAddr, Ipv4Addr}, path::PathBuf, @@ -39,13 +31,12 @@ use tokio::runtime::Builder; use tracing::level_filters::LevelFilter; use uuid::Uuid; -mod attach; -mod check; +pub mod command; mod formatting; mod graph; -mod logs; +pub mod output; +pub mod session; mod template; -mod up; const LOCALHOST: IpAddr = IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)); const LISTEN_WILDCARD: IpAddr = IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)); @@ -90,14 +81,17 @@ enum Command { #[clap(value_name = "PATH")] dataflow: String, /// Address of the dora coordinator - #[clap(long, value_name = "IP", default_value_t = LOCALHOST)] - 
coordinator_addr: IpAddr, + #[clap(long, value_name = "IP")] + coordinator_addr: Option, /// Port number of the coordinator control server - #[clap(long, value_name = "PORT", default_value_t = DORA_COORDINATOR_PORT_CONTROL_DEFAULT)] - coordinator_port: u16, + #[clap(long, value_name = "PORT")] + coordinator_port: Option, // Use UV to build nodes. #[clap(long, action)] uv: bool, + // Run build on local machine + #[clap(long, action)] + local: bool, }, /// Generate a new project or node. Choose the language between Rust, Python, C or C++. New { @@ -298,14 +292,14 @@ enum Lang { } pub fn lib_main(args: Args) { - if let Err(err) = run(args) { + if let Err(err) = run_cli(args) { eprintln!("\n\n{}", "[ERROR]".bold().red()); eprintln!("{err:?}"); std::process::exit(1); } } -fn run(args: Args) -> eyre::Result<()> { +fn run_cli(args: Args) -> eyre::Result<()> { #[cfg(feature = "tracing")] match &args.command { Command::Daemon { @@ -347,12 +341,6 @@ fn run(args: Args) -> eyre::Result<()> { } }; - let log_level = env_logger::Builder::new() - .filter_level(log::LevelFilter::Info) - .parse_default_env() - .build() - .filter(); - match args.command { Command::Check { dataflow, @@ -367,9 +355,9 @@ fn run(args: Args) -> eyre::Result<()> { .ok_or_else(|| eyre::eyre!("dataflow path has no parent dir"))? .to_owned(); Descriptor::blocking_read(&dataflow)?.check(&working_dir)?; - check::check_environment((coordinator_addr, coordinator_port).into())? + command::check::check_environment((coordinator_addr, coordinator_port).into())? } - None => check::check_environment((coordinator_addr, coordinator_port).into())?, + None => command::check::check_environment((coordinator_addr, coordinator_port).into())?, }, Command::Graph { dataflow, @@ -383,34 +371,15 @@ fn run(args: Args) -> eyre::Result<()> { coordinator_addr, coordinator_port, uv, - } => { - let coordinator_socket = (coordinator_addr, coordinator_port).into(); - let (_, _, mut session, uuid) = - start_dataflow(dataflow, None, coordinator_socket, uv, true)?; - // wait until build is finished - wait_until_dataflow_started( - uuid, - &mut session, - true, - coordinator_socket, - log::LevelFilter::Info, - )?; - } + local, + } => command::build(dataflow, coordinator_addr, coordinator_port, uv, local)?, Command::New { args, internal_create_with_path_dependencies, } => template::create(args, internal_create_with_path_dependencies)?, - Command::Run { dataflow, uv } => { - let dataflow_path = resolve_dataflow(dataflow).context("could not resolve dataflow")?; - let rt = Builder::new_multi_thread() - .enable_all() - .build() - .context("tokio runtime failed")?; - let result = rt.block_on(Daemon::run_dataflow(&dataflow_path, uv))?; - handle_dataflow_result(result, None)? - } + Command::Run { dataflow, uv } => command::run(dataflow, uv)?, Command::Up { config } => { - up::up(config.as_deref())?; + command::up::up(config.as_deref())?; } Command::Logs { dataflow, @@ -425,7 +394,7 @@ fn run(args: Args) -> eyre::Result<()> { if let Some(dataflow) = dataflow { let uuid = Uuid::parse_str(&dataflow).ok(); let name = if uuid.is_some() { None } else { Some(dataflow) }; - logs::logs(&mut *session, uuid, name, node)? + command::logs(&mut *session, uuid, name, node)? } else { let active: Vec = list.get_active(); @@ -434,7 +403,7 @@ fn run(args: Args) -> eyre::Result<()> { [uuid] => uuid.clone(), _ => inquire::Select::new("Choose dataflow to show logs:", active).prompt()?, }; - logs::logs(&mut *session, Some(uuid.uuid), None, node)? 
+ command::logs(&mut *session, Some(uuid.uuid), None, node)? } } Command::Start { @@ -448,39 +417,15 @@ fn run(args: Args) -> eyre::Result<()> { uv, } => { let coordinator_socket = (coordinator_addr, coordinator_port).into(); - let (dataflow, dataflow_descriptor, mut session, dataflow_id) = - start_dataflow(dataflow, name, coordinator_socket, uv, false)?; - - let attach = match (attach, detach) { - (true, true) => eyre::bail!("both `--attach` and `--detach` are given"), - (true, false) => true, - (false, true) => false, - (false, false) => { - println!("attaching to dataflow (use `--detach` to run in background)"); - true - } - }; - - if attach { - attach_dataflow( - dataflow_descriptor, - dataflow, - dataflow_id, - &mut *session, - hot_reload, - coordinator_socket, - log_level, - )? - } else { - // wait until dataflow is started - wait_until_dataflow_started( - dataflow_id, - &mut session, - false, - coordinator_socket, - log::LevelFilter::Info, - )?; - } + command::start( + dataflow, + name, + coordinator_socket, + attach, + detach, + hot_reload, + uv, + )? } Command::List { coordinator_addr, @@ -510,7 +455,7 @@ fn run(args: Args) -> eyre::Result<()> { config, coordinator_addr, coordinator_port, - } => up::destroy( + } => command::up::destroy( config.as_deref(), (coordinator_addr, coordinator_port).into(), )?, @@ -560,8 +505,11 @@ fn run(args: Args) -> eyre::Result<()> { coordinator_addr ); } + let dataflow_session = + DataflowSession::read_session(&dataflow_path).context("failed to read DataflowSession")?; - let result = Daemon::run_dataflow(&dataflow_path, false).await?; + let result = Daemon::run_dataflow(&dataflow_path, + dataflow_session.build_id, dataflow_session.local_build, dataflow_session.session_id, false).await?; handle_dataflow_result(result, None) } None => { @@ -632,114 +580,6 @@ fn run(args: Args) -> eyre::Result<()> { Ok(()) } -fn start_dataflow( - dataflow: String, - name: Option, - coordinator_socket: SocketAddr, - uv: bool, - build_only: bool, -) -> Result<(PathBuf, Descriptor, Box, Uuid), eyre::Error> { - let dataflow = resolve_dataflow(dataflow).context("could not resolve dataflow")?; - let dataflow_descriptor = - Descriptor::blocking_read(&dataflow).wrap_err("Failed to read yaml dataflow")?; - let working_dir = dataflow - .canonicalize() - .context("failed to canonicalize dataflow path")? - .parent() - .ok_or_else(|| eyre::eyre!("dataflow path has no parent dir"))? 
- .to_owned(); - let mut session = connect_to_coordinator(coordinator_socket) - .wrap_err("failed to connect to dora coordinator")?; - let dataflow_id = { - let dataflow = dataflow_descriptor.clone(); - let session: &mut TcpRequestReplyConnection = &mut *session; - let reply_raw = session - .request( - &serde_json::to_vec(&ControlRequest::Start { - dataflow, - name, - local_working_dir: working_dir, - uv, - build_only, - }) - .unwrap(), - ) - .wrap_err("failed to send start dataflow message")?; - - let result: ControlRequestReply = - serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; - match result { - ControlRequestReply::DataflowStartTriggered { uuid } => { - if build_only { - eprintln!("dataflow build triggered"); - } else { - eprintln!("dataflow start triggered: {uuid}"); - } - uuid - } - ControlRequestReply::Error(err) => bail!("{err}"), - other => bail!("unexpected start dataflow reply: {other:?}"), - } - }; - Ok((dataflow, dataflow_descriptor, session, dataflow_id)) -} - -fn wait_until_dataflow_started( - dataflow_id: Uuid, - session: &mut Box, - build_only: bool, - coordinator_addr: SocketAddr, - log_level: log::LevelFilter, -) -> eyre::Result<()> { - // subscribe to log messages - let mut log_session = TcpConnection { - stream: TcpStream::connect(coordinator_addr) - .wrap_err("failed to connect to dora coordinator")?, - }; - log_session - .send( - &serde_json::to_vec(&ControlRequest::LogSubscribe { - dataflow_id, - level: log_level, - }) - .wrap_err("failed to serialize message")?, - ) - .wrap_err("failed to send log subscribe request to coordinator")?; - std::thread::spawn(move || { - while let Ok(raw) = log_session.receive() { - let parsed: eyre::Result = - serde_json::from_slice(&raw).context("failed to parse log message"); - match parsed { - Ok(log_message) => { - print_log_message(log_message); - } - Err(err) => { - tracing::warn!("failed to parse log message: {err:?}") - } - } - } - }); - - let reply_raw = session - .request(&serde_json::to_vec(&ControlRequest::WaitForSpawn { dataflow_id }).unwrap()) - .wrap_err("failed to send start dataflow message")?; - - let result: ControlRequestReply = - serde_json::from_slice(&reply_raw).wrap_err("failed to parse reply")?; - match result { - ControlRequestReply::DataflowSpawned { uuid } => { - if build_only { - eprintln!("dataflow build finished"); - } else { - eprintln!("dataflow started: {uuid}"); - } - } - ControlRequestReply::Error(err) => bail!("{err}"), - other => bail!("unexpected start dataflow reply: {other:?}"), - } - Ok(()) -} - fn stop_dataflow_interactive( grace_duration: Option, session: &mut TcpRequestReplyConnection, @@ -890,6 +730,8 @@ use pyo3::{ wrap_pyfunction, Bound, PyResult, Python, }; +use crate::session::DataflowSession; + #[cfg(feature = "python")] #[pyfunction] fn py_main(_py: Python) -> PyResult<()> { diff --git a/binaries/cli/src/output.rs b/binaries/cli/src/output.rs new file mode 100644 index 00000000..ad35ad67 --- /dev/null +++ b/binaries/cli/src/output.rs @@ -0,0 +1,48 @@ +use colored::Colorize; +use dora_message::common::LogMessage; + +pub fn print_log_message(log_message: LogMessage) { + let LogMessage { + build_id, + dataflow_id, + node_id, + daemon_id, + level, + target, + module_path: _, + file: _, + line: _, + message, + } = log_message; + let level = match level { + log::Level::Error => "ERROR".red(), + log::Level::Warn => "WARN ".yellow(), + log::Level::Info => "INFO ".green(), + other => format!("{other:5}").normal(), + }; + let dataflow = if let Some(dataflow_id) = 
dataflow_id {
+        format!(" dataflow `{dataflow_id}`").cyan()
+    } else {
+        String::new().cyan()
+    };
+    let build = if let Some(build_id) = build_id {
+        format!(" build `{build_id}`").cyan()
+    } else {
+        String::new().cyan()
+    };
+    let daemon = match daemon_id {
+        Some(id) => format!(" on daemon `{id}`"),
+        None => " on default daemon".to_string(),
+    }
+    .bright_black();
+    let node = match node_id {
+        Some(node_id) => format!(" {node_id}").bold(),
+        None => "".normal(),
+    };
+    let target = match target {
+        Some(target) => format!(" {target}").dimmed(),
+        None => "".normal(),
+    };
+
+    println!("{level}{build}{dataflow}{daemon}{node}{target}: {message}");
+}
diff --git a/binaries/cli/src/session.rs b/binaries/cli/src/session.rs
new file mode 100644
index 00000000..29609e54
--- /dev/null
+++ b/binaries/cli/src/session.rs
@@ -0,0 +1,73 @@
+use std::{
+    collections::BTreeMap,
+    path::{Path, PathBuf},
+};
+
+use dora_core::build::BuildInfo;
+use dora_message::{common::GitSource, id::NodeId, BuildId, SessionId};
+use eyre::{Context, ContextCompat};
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct DataflowSession {
+    pub build_id: Option<BuildId>,
+    pub session_id: SessionId,
+    pub git_sources: BTreeMap<NodeId, GitSource>,
+    pub local_build: Option<BuildInfo>,
+}
+
+impl Default for DataflowSession {
+    fn default() -> Self {
+        Self {
+            build_id: None,
+            session_id: SessionId::generate(),
+            git_sources: Default::default(),
+            local_build: Default::default(),
+        }
+    }
+}
+
+impl DataflowSession {
+    pub fn read_session(dataflow_path: &Path) -> eyre::Result<Self> {
+        let session_file = session_file_path(dataflow_path)?;
+        if session_file.exists() {
+            if let Ok(parsed) = deserialize(&session_file) {
+                return Ok(parsed);
+            } else {
+                tracing::warn!("failed to read dataflow session file, regenerating (you might need to run `dora build` again)");
+            }
+        }
+
+        let default_session = DataflowSession::default();
+        default_session.write_out_for_dataflow(dataflow_path)?;
+        Ok(default_session)
+    }
+
+    pub fn write_out_for_dataflow(&self, dataflow_path: &Path) -> eyre::Result<()> {
+        let session_file = session_file_path(dataflow_path)?;
+        std::fs::write(session_file, self.serialize()?)
+            .context("failed to write dataflow session file")?;
+        Ok(())
+    }
+
+    fn serialize(&self) -> eyre::Result<String> {
+        serde_yaml::to_string(&self).context("failed to serialize dataflow session file")
+    }
+}
+
+fn deserialize(session_file: &Path) -> eyre::Result<DataflowSession> {
+    std::fs::read_to_string(&session_file)
+        .context("failed to read DataflowSession file")
+        .and_then(|s| {
+            serde_yaml::from_str(&s).context("failed to deserialize DataflowSession file")
+        })
+}
+
+fn session_file_path(dataflow_path: &Path) -> eyre::Result<PathBuf> {
+    let file_stem = dataflow_path
+        .file_stem()
+        .wrap_err("dataflow path has no file stem")?
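// [editor's note] `session_file_path` above derives the session file from the
// dataflow's file stem, so `dataflow.yml` gets a sibling `dataflow.dora-session.yaml`.
// A std-only sketch of the same naming scheme (paths are illustrative):
fn example_session_path() {
    use std::path::Path;
    let dataflow = Path::new("examples/rust-dataflow/dataflow.yml");
    let stem = dataflow.file_stem().unwrap().to_str().unwrap();
    // sibling file: "examples/rust-dataflow/dataflow.dora-session.yaml"
    let session_file = dataflow.with_file_name(format!("{stem}.dora-session.yaml"));
    assert!(session_file.ends_with("dataflow.dora-session.yaml"));
}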
+ .to_str() + .wrap_err("dataflow file stem is not valid utf-8")?; + let session_file = dataflow_path.with_file_name(format!("{file_stem}.dora-session.yaml")); + Ok(session_file) +} diff --git a/binaries/coordinator/src/control.rs b/binaries/coordinator/src/control.rs index c0e92417..446233a8 100644 --- a/binaries/coordinator/src/control.rs +++ b/binaries/coordinator/src/control.rs @@ -2,7 +2,9 @@ use crate::{ tcp_utils::{tcp_receive, tcp_send}, Event, }; -use dora_message::{cli_to_coordinator::ControlRequest, coordinator_to_cli::ControlRequestReply}; +use dora_message::{ + cli_to_coordinator::ControlRequest, coordinator_to_cli::ControlRequestReply, BuildId, +}; use eyre::{eyre, Context}; use futures::{ future::{self, Either}, @@ -79,6 +81,7 @@ async fn handle_requests( tx: mpsc::Sender, _finish_tx: mpsc::Sender<()>, ) { + let peer_addr = connection.peer_addr().ok(); loop { let next_request = tcp_receive(&mut connection).map(Either::Left); let coordinator_stopped = tx.closed().map(Either::Right); @@ -114,11 +117,29 @@ async fn handle_requests( break; } - let result = match request { + if let Ok(ControlRequest::BuildLogSubscribe { build_id, level }) = request { + let _ = tx + .send(ControlEvent::BuildLogSubscribe { + build_id, + level, + connection, + }) + .await; + break; + } + + let mut result = match request { Ok(request) => handle_request(request, &tx).await, Err(err) => Err(err), }; + if let Ok(ControlRequestReply::CliAndDefaultDaemonIps { cli, .. }) = &mut result { + if cli.is_none() { + // fill cli IP address in reply + *cli = peer_addr.map(|s| s.ip()); + } + } + let reply = result.unwrap_or_else(|err| ControlRequestReply::Error(format!("{err:?}"))); let serialized: Vec = match serde_json::to_vec(&reply).wrap_err("failed to serialize ControlRequestReply") { @@ -179,6 +200,11 @@ pub enum ControlEvent { level: log::LevelFilter, connection: TcpStream, }, + BuildLogSubscribe { + build_id: BuildId, + level: log::LevelFilter, + connection: TcpStream, + }, Error(eyre::Report), } diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 1302662c..7f5f7d3f 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -5,22 +5,27 @@ use crate::{ pub use control::ControlEvent; use dora_core::{ config::{NodeId, OperatorId}, + descriptor::DescriptorExt, uhlc::{self, HLC}, }; use dora_message::{ cli_to_coordinator::ControlRequest, - common::DaemonId, + common::{DaemonId, GitSource}, coordinator_to_cli::{ ControlRequestReply, DataflowIdAndName, DataflowList, DataflowListEntry, DataflowResult, DataflowStatus, LogLevel, LogMessage, }, - coordinator_to_daemon::{DaemonCoordinatorEvent, RegisterResult, Timestamped}, + coordinator_to_daemon::{ + BuildDataflowNodes, DaemonCoordinatorEvent, RegisterResult, Timestamped, + }, daemon_to_coordinator::{DaemonCoordinatorReply, DataflowDaemonResult}, descriptor::{Descriptor, ResolvedNode}, + BuildId, DataflowId, SessionId, }; use eyre::{bail, eyre, ContextCompat, Result, WrapErr}; use futures::{future::join_all, stream::FuturesUnordered, Future, Stream, StreamExt}; use futures_concurrency::stream::Merge; +use itertools::Itertools; use log_subscriber::LogSubscriber; use run::SpawnedDataflow; use std::{ @@ -139,6 +144,10 @@ impl DaemonConnections { } } + fn get(&self, id: &DaemonId) -> Option<&DaemonConnection> { + self.daemons.get(id) + } + fn get_mut(&mut self, id: &DaemonId) -> Option<&mut DaemonConnection> { self.daemons.get_mut(id) } @@ -194,10 +203,12 @@ async fn start_inner( let mut events = 
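// [editor's note] The control server above answers `CliAndDefaultDaemonOnSameMachine`
// with `cli: None` and lets the per-connection handler fill in the caller's IP from
// the TCP connection. A minimal sketch of that pattern (an assumed simplification;
// the real handler caches `peer_addr` once per connection):
fn fill_cli_ip(reply_cli: &mut Option<std::net::IpAddr>, connection: &tokio::net::TcpStream) {
    if reply_cli.is_none() {
        // `peer_addr` fails on a disconnected socket, hence the `ok()`
        *reply_cli = connection.peer_addr().ok().map(|addr| addr.ip());
    }
}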
(abortable_events, daemon_events).merge(); - let mut running_dataflows: HashMap = HashMap::new(); - let mut dataflow_results: HashMap> = + let mut running_builds: HashMap = HashMap::new(); + + let mut running_dataflows: HashMap = HashMap::new(); + let mut dataflow_results: HashMap> = HashMap::new(); - let mut archived_dataflows: HashMap = HashMap::new(); + let mut archived_dataflows: HashMap = HashMap::new(); let mut daemon_connections = DaemonConnections::default(); while let Some(event) = events.next().await { @@ -351,9 +362,10 @@ async fn start_inner( let mut finished_dataflow = entry.remove(); let dataflow_id = finished_dataflow.uuid; send_log_message( - &mut finished_dataflow, + &mut finished_dataflow.log_subscribers, &LogMessage { - dataflow_id, + build_id: None, + dataflow_id: Some(dataflow_id), node_id: None, daemon_id: None, level: LogLevel::Info, @@ -380,7 +392,7 @@ async fn start_inner( } if !matches!( finished_dataflow.spawn_result, - SpawnResult::Spawned { .. } + CachedResult::Cached { .. } ) { log::error!("pending spawn result on dataflow finish"); } @@ -399,12 +411,56 @@ async fn start_inner( reply_sender, } => { match request { + ControlRequest::Build { + session_id, + dataflow, + git_sources, + prev_git_sources, + local_working_dir, + uv, + } => { + // assign a random build id + let build_id = BuildId::generate(); + + let result = build_dataflow( + build_id, + session_id, + dataflow, + git_sources, + prev_git_sources, + local_working_dir, + &clock, + uv, + &mut daemon_connections, + ) + .await; + match result { + Ok(build) => { + running_builds.insert(build_id, build); + let _ = reply_sender.send(Ok( + ControlRequestReply::DataflowBuildTriggered { build_id }, + )); + } + Err(err) => { + let _ = reply_sender.send(Err(err)); + } + } + } + ControlRequest::WaitForBuild { build_id } => { + if let Some(build) = running_builds.get_mut(&build_id) { + build.build_result.register(reply_sender); + } else { + let _ = + reply_sender.send(Err(eyre!("unknown build id {build_id}"))); + } + } ControlRequest::Start { + build_id, + session_id, dataflow, name, local_working_dir, uv, - build_only, } => { let name = name.or_else(|| names::Generator::default().next()); @@ -419,13 +475,14 @@ async fn start_inner( } } let dataflow = start_dataflow( + build_id, + session_id, dataflow, local_working_dir, name, &mut daemon_connections, &clock, uv, - build_only, ) .await?; Ok(dataflow) @@ -652,6 +709,27 @@ async fn start_inner( "LogSubscribe request should be handled separately" ))); } + ControlRequest::BuildLogSubscribe { .. 
} => {
+                    let _ = reply_sender.send(Err(eyre::eyre!(
+                        "BuildLogSubscribe request should be handled separately"
+                    )));
+                }
+                ControlRequest::CliAndDefaultDaemonOnSameMachine => {
+                    let mut default_daemon_ip = None;
+                    if let Some(default_id) = daemon_connections.unnamed().next() {
+                        if let Some(connection) = daemon_connections.get(default_id) {
+                            if let Ok(addr) = connection.stream.peer_addr() {
+                                default_daemon_ip = Some(addr.ip());
+                            }
+                        }
+                    }
+                    let _ = reply_sender.send(Ok(
+                        ControlRequestReply::CliAndDefaultDaemonIps {
+                            default_daemon: default_daemon_ip,
+                            cli: None, // filled later
+                        },
+                    ));
+                }
             }
         }
         ControlEvent::Error(err) => tracing::error!("{err:?}"),
@@ -666,6 +744,17 @@ async fn start_inner(
                     .push(LogSubscriber::new(level, connection));
                 }
             }
+            ControlEvent::BuildLogSubscribe {
+                build_id,
+                level,
+                connection,
+            } => {
+                if let Some(build) = running_builds.get_mut(&build_id) {
+                    build
+                        .log_subscribers
+                        .push(LogSubscriber::new(level, connection));
+                }
+            }
         },
         Event::DaemonHeartbeatInterval => {
             let mut disconnected = BTreeSet::new();
@@ -721,14 +810,52 @@ async fn start_inner(
                 }
             }
             Event::Log(message) => {
-                if let Some(dataflow) = running_dataflows.get_mut(&message.dataflow_id) {
-                    send_log_message(dataflow, &message).await;
+                if let Some(dataflow_id) = &message.dataflow_id {
+                    if let Some(dataflow) = running_dataflows.get_mut(dataflow_id) {
+                        send_log_message(&mut dataflow.log_subscribers, &message).await;
+                    }
+                }
+                if let Some(build_id) = message.build_id {
+                    if let Some(build) = running_builds.get_mut(&build_id) {
+                        send_log_message(&mut build.log_subscribers, &message).await;
+                    }
                 }
             }
             Event::DaemonExit { daemon_id } => {
                 tracing::info!("Daemon `{daemon_id}` exited");
                 daemon_connections.remove(&daemon_id);
             }
+            Event::DataflowBuildResult {
+                build_id,
+                daemon_id,
+                result,
+            } => match running_builds.get_mut(&build_id) {
+                Some(build) => {
+                    build.pending_build_results.remove(&daemon_id);
+                    match result {
+                        Ok(()) => {}
+                        Err(err) => {
+                            build.errors.push(format!("{err:?}"));
+                        }
+                    };
+                    if build.pending_build_results.is_empty() {
+                        tracing::info!("dataflow build finished: `{build_id}`");
+                        let mut build = running_builds.remove(&build_id).unwrap();
+                        let result = if build.errors.is_empty() {
+                            Ok(())
+                        } else {
+                            Err(format!("build failed: {}", build.errors.join("\n\n")))
+                        };
+
+                        build.build_result.set_result(Ok(
+                            ControlRequestReply::DataflowBuildFinished { build_id, result },
+                        ));
+                    }
+                }
+                None => {
+                    tracing::warn!("received DataflowBuildResult, but no matching build in `running_builds` map");
+                }
+            },
             Event::DataflowSpawnResult {
                 dataflow_id,
                 daemon_id,
@@ -739,21 +866,10 @@ async fn start_inner(
                     match result {
                         Ok(()) => {
                             if dataflow.pending_spawn_results.is_empty() {
-                                tracing::info!(
-                                    "successfully {} dataflow `{dataflow_id}`",
-                                    if dataflow.build_only {
-                                        "built"
-                                    } else {
-                                        "spawned"
-                                    }
-                                );
+                                tracing::info!("successfully spawned dataflow `{dataflow_id}`");
                                 dataflow.spawn_result.set_result(Ok(
                                     ControlRequestReply::DataflowSpawned { uuid: dataflow_id },
                                 ));
-
-                                if dataflow.build_only {
-                                    running_dataflows.remove(&dataflow_id);
-                                }
                             }
                         }
                         Err(err) => {
@@ -783,8 +899,8 @@ async fn start_inner(
     Ok(())
 }
 
-async fn send_log_message(dataflow: &mut RunningDataflow, message: &LogMessage) {
-    for subscriber in &mut dataflow.log_subscribers {
+async fn send_log_message(log_subscribers: &mut Vec<LogSubscriber>, message: &LogMessage) {
+    for subscriber in log_subscribers.iter_mut() {
         let send_result = tokio::time::timeout(Duration::from_millis(100),
            subscriber.send_message(message));
@@ -792,7 +908,7 @@ async fn send_log_message(dataflow: &mut RunningDataflow, message: &LogMessage)
             subscriber.close();
         }
     }
-    dataflow.log_subscribers.retain(|s| !s.is_closed());
+    log_subscribers.retain(|s| !s.is_closed());
 }
 
 fn dataflow_result(
@@ -859,6 +975,15 @@ async fn send_heartbeat_message(
         .wrap_err("failed to send heartbeat message to daemon")
 }
 
+struct RunningBuild {
+    errors: Vec<String>,
+    build_result: CachedResult,
+
+    log_subscribers: Vec<LogSubscriber>,
+
+    pending_build_results: BTreeSet<DaemonId>,
+}
+
 struct RunningDataflow {
     name: Option<String>,
     uuid: Uuid,
@@ -869,26 +994,24 @@ struct RunningDataflow {
     exited_before_subscribe: Vec<NodeId>,
     nodes: BTreeMap<NodeId, ResolvedNode>,
 
-    spawn_result: SpawnResult,
+    spawn_result: CachedResult,
 
     stop_reply_senders: Vec<tokio::sync::oneshot::Sender<eyre::Result<ControlRequestReply>>>,
 
     log_subscribers: Vec<LogSubscriber>,
 
     pending_spawn_results: BTreeSet<DaemonId>,
-
-    build_only: bool,
 }
 
-pub enum SpawnResult {
+pub enum CachedResult {
     Pending {
         result_senders: Vec<tokio::sync::oneshot::Sender<eyre::Result<ControlRequestReply>>>,
    },
-    Spawned {
+    Cached {
         result: eyre::Result<ControlRequestReply>,
     },
 }
 
-impl Default for SpawnResult {
+impl Default for CachedResult {
     fn default() -> Self {
         Self::Pending {
             result_senders: Vec::new(),
@@ -896,14 +1019,14 @@
     }
 }
 
-impl SpawnResult {
+impl CachedResult {
     fn register(
         &mut self,
         reply_sender: tokio::sync::oneshot::Sender<eyre::Result<ControlRequestReply>>,
     ) {
         match self {
-            SpawnResult::Pending { result_senders } => result_senders.push(reply_sender),
-            SpawnResult::Spawned { result } => {
+            CachedResult::Pending { result_senders } => result_senders.push(reply_sender),
+            CachedResult::Cached { result } => {
                 Self::send_result_to(result, reply_sender);
             }
         }
@@ -911,13 +1034,13 @@
 
     fn set_result(&mut self, result: eyre::Result<ControlRequestReply>) {
         match self {
-            SpawnResult::Pending { result_senders } => {
+            CachedResult::Pending { result_senders } => {
                 for sender in result_senders.drain(..) {
                     Self::send_result_to(&result, sender);
                 }
-                *self = SpawnResult::Spawned { result };
+                *self = CachedResult::Cached { result };
             }
-            SpawnResult::Spawned { .. } => {}
+            CachedResult::Cached { .. } => {}
         }
     }
@@ -1123,26 +1246,135 @@ async fn retrieve_logs(
     reply_logs.map_err(|err| eyre!(err))
 }
 
+#[allow(clippy::too_many_arguments)]
+#[tracing::instrument(skip(daemon_connections, clock))]
+async fn build_dataflow(
+    build_id: BuildId,
+    session_id: SessionId,
+    dataflow: Descriptor,
+    git_sources: BTreeMap<NodeId, GitSource>,
+    prev_git_sources: BTreeMap<NodeId, GitSource>,
+    local_working_dir: Option<PathBuf>,
+    clock: &HLC,
+    uv: bool,
+    daemon_connections: &mut DaemonConnections,
+) -> eyre::Result<RunningBuild> {
+    let nodes = dataflow.resolve_aliases_and_set_defaults()?;
+
+    let mut git_sources_by_daemon = git_sources
+        .into_iter()
+        .into_grouping_map_by(|(id, _)| nodes.get(id).and_then(|n| n.deploy.machine.as_ref()))
+        .collect();
+    let mut prev_git_sources_by_daemon = prev_git_sources
+        .into_iter()
+        .into_grouping_map_by(|(id, _)| nodes.get(id).and_then(|n| n.deploy.machine.as_ref()))
+        .collect();
+
+    let nodes_by_daemon = nodes.values().into_group_map_by(|n| &n.deploy.machine);
+
+    let mut daemons = BTreeSet::new();
+    for (machine, nodes_on_machine) in &nodes_by_daemon {
+        let nodes_on_machine = nodes_on_machine.iter().map(|n| n.id.clone()).collect();
+        tracing::debug!(
+            "Running dataflow build `{session_id}` on machine `{machine:?}` (nodes: {nodes_on_machine:?})"
+        );
+
+        let build_command = BuildDataflowNodes {
+            build_id,
+            session_id,
+            local_working_dir: local_working_dir.clone(),
+            git_sources: git_sources_by_daemon
+                .remove(&machine.as_ref())
+                .unwrap_or_default(),
+            prev_git_sources: prev_git_sources_by_daemon
+                .remove(&machine.as_ref())
+                .unwrap_or_default(),
+            dataflow_descriptor: dataflow.clone(),
+            nodes_on_machine,
+            uv,
+        };
+        let message = serde_json::to_vec(&Timestamped {
+            inner: DaemonCoordinatorEvent::Build(build_command),
+            timestamp: clock.new_timestamp(),
+        })?;
+
+        let daemon_id = build_dataflow_on_machine(daemon_connections, machine.as_deref(), &message)
+            .await
+            .wrap_err_with(|| format!("failed to build dataflow on machine `{machine:?}`"))?;
+        daemons.insert(daemon_id);
+    }
+
+    tracing::info!("successfully triggered dataflow build `{session_id}`");
+
+    Ok(RunningBuild {
+        errors: Vec::new(),
+        build_result: CachedResult::default(),
+        log_subscribers: Vec::new(),
+        pending_build_results: daemons,
+    })
+}
+
+async fn build_dataflow_on_machine(
+    daemon_connections: &mut DaemonConnections,
+    machine: Option<&str>,
+    message: &[u8],
+) -> Result<DaemonId> {
+    let daemon_id = match machine {
+        Some(machine) => daemon_connections
+            .get_matching_daemon_id(machine)
+            .wrap_err_with(|| format!("no matching daemon for machine id {machine:?}"))?
+            .clone(),
+        None => daemon_connections
+            .unnamed()
+            .next()
+            .wrap_err("no unnamed daemon connections")?
+            .clone(),
+    };
+
+    let daemon_connection = daemon_connections
+        .get_mut(&daemon_id)
+        .wrap_err_with(|| format!("no daemon connection for daemon `{daemon_id}`"))?;
+    tcp_send(&mut daemon_connection.stream, message)
+        .await
+        .wrap_err("failed to send build message to daemon")?;
+
+    let reply_raw = tcp_receive(&mut daemon_connection.stream)
+        .await
+        .wrap_err("failed to receive build reply from daemon")?;
+    match serde_json::from_slice(&reply_raw)
+        .wrap_err("failed to deserialize build reply from daemon")?
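// [editor's note] `CachedResult` (formerly `SpawnResult`) above is a one-shot result
// latch: waiters that register before the result is set get queued, later waiters are
// answered immediately, and the first stored result wins. A self-contained sketch of
// the same idea over plain `String` results (the real type carries
// `eyre::Result<ControlRequestReply>`):
enum Latch {
    Pending { waiters: Vec<tokio::sync::oneshot::Sender<String>> },
    Ready { value: String },
}

impl Latch {
    fn register(&mut self, tx: tokio::sync::oneshot::Sender<String>) {
        match self {
            Latch::Pending { waiters } => waiters.push(tx),
            // late registrations get an immediate answer
            Latch::Ready { value } => {
                let _ = tx.send(value.clone());
            }
        }
    }

    fn set(&mut self, value: String) {
        match self {
            Latch::Pending { waiters } => {
                // flush everyone who was waiting, then cache the value
                for tx in waiters.drain(..) {
                    let _ = tx.send(value.clone());
                }
                *self = Latch::Ready { value };
            }
            // first result wins, as in `CachedResult::set_result`
            Latch::Ready { .. } => {}
        }
    }
}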
+ { + DaemonCoordinatorReply::TriggerBuildResult(result) => result + .map_err(|e| eyre!(e)) + .wrap_err("daemon returned an error")?, + _ => bail!("unexpected reply"), + } + Ok(daemon_id) +} + +#[allow(clippy::too_many_arguments)] async fn start_dataflow( + build_id: Option, + session_id: SessionId, dataflow: Descriptor, - working_dir: PathBuf, + local_working_dir: Option, name: Option, daemon_connections: &mut DaemonConnections, clock: &HLC, uv: bool, - build_only: bool, ) -> eyre::Result { let SpawnedDataflow { uuid, daemons, nodes, } = spawn_dataflow( + build_id, + session_id, dataflow, - working_dir, + local_working_dir, daemon_connections, clock, uv, - build_only, ) .await?; Ok(RunningDataflow { @@ -1156,11 +1388,10 @@ async fn start_dataflow( exited_before_subscribe: Default::default(), daemons: daemons.clone(), nodes, - spawn_result: SpawnResult::default(), + spawn_result: CachedResult::default(), stop_reply_senders: Vec::new(), log_subscribers: Vec::new(), pending_spawn_results: daemons, - build_only, }) } @@ -1235,6 +1466,11 @@ pub enum Event { DaemonExit { daemon_id: dora_message::common::DaemonId, }, + DataflowBuildResult { + build_id: BuildId, + daemon_id: DaemonId, + result: eyre::Result<()>, + }, DataflowSpawnResult { dataflow_id: uuid::Uuid, daemon_id: DaemonId, @@ -1264,6 +1500,7 @@ impl Event { Event::CtrlC => "CtrlC", Event::Log(_) => "Log", Event::DaemonExit { .. } => "DaemonExit", + Event::DataflowBuildResult { .. } => "DataflowBuildResult", Event::DataflowSpawnResult { .. } => "DataflowSpawnResult", } } diff --git a/binaries/coordinator/src/listener.rs b/binaries/coordinator/src/listener.rs index 39e17bca..ab7e3b9d 100644 --- a/binaries/coordinator/src/listener.rs +++ b/binaries/coordinator/src/listener.rs @@ -112,6 +112,16 @@ pub async fn handle_connection( break; } } + DaemonEvent::BuildResult { build_id, result } => { + let event = Event::DataflowBuildResult { + build_id, + daemon_id, + result: result.map_err(|err| eyre::eyre!(err)), + }; + if events_tx.send(event).await.is_err() { + break; + } + } DaemonEvent::SpawnResult { dataflow_id, result, diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index 425f0213..ca89fb87 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -10,6 +10,7 @@ use dora_message::{ daemon_to_coordinator::DaemonCoordinatorReply, descriptor::{Descriptor, ResolvedNode}, id::NodeId, + BuildId, SessionId, }; use eyre::{bail, eyre, ContextCompat, WrapErr}; use itertools::Itertools; @@ -21,12 +22,13 @@ use uuid::{NoContext, Timestamp, Uuid}; #[tracing::instrument(skip(daemon_connections, clock))] pub(super) async fn spawn_dataflow( + build_id: Option, + session_id: SessionId, dataflow: Descriptor, - working_dir: PathBuf, + local_working_dir: Option, daemon_connections: &mut DaemonConnections, clock: &HLC, uv: bool, - build_only: bool, ) -> eyre::Result { let nodes = dataflow.resolve_aliases_and_set_defaults()?; let uuid = Uuid::new_v7(Timestamp::now(NoContext)); @@ -37,18 +39,18 @@ pub(super) async fn spawn_dataflow( for (machine, nodes_on_machine) in &nodes_by_daemon { let spawn_nodes = nodes_on_machine.iter().map(|n| n.id.clone()).collect(); tracing::debug!( - "{} dataflow `{uuid}` on machine `{machine:?}` (nodes: {spawn_nodes:?})", - if build_only { "Building" } else { "Spawning" } + "Spawning dataflow `{uuid}` on machine `{machine:?}` (nodes: {spawn_nodes:?})" ); let spawn_command = SpawnDataflowNodes { + build_id, + session_id, dataflow_id: uuid, - working_dir: 
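// [editor's note] `build_dataflow` above fans the node set out per target machine with
// itertools' grouping adapters before messaging each daemon. A tiny standalone sketch
// of that grouping step (node names and machine ids are made up):
fn example_group_by_machine() {
    use itertools::Itertools;
    let nodes = vec![
        ("camera", Some("robot-1")),
        ("plotter", None),
        ("detector", Some("robot-1")),
    ];
    // one group per machine; `None` is the unnamed default daemon
    let by_machine = nodes.into_iter().into_group_map_by(|(_, machine)| *machine);
    assert_eq!(by_machine[&Some("robot-1")].len(), 2);
    assert_eq!(by_machine[&None].len(), 1);
}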
working_dir.clone(), + local_working_dir: local_working_dir.clone(), nodes: nodes.clone(), dataflow_descriptor: dataflow.clone(), spawn_nodes, uv, - build_only, }; let message = serde_json::to_vec(&Timestamped { inner: DaemonCoordinatorEvent::Spawn(spawn_command), @@ -57,19 +59,11 @@ pub(super) async fn spawn_dataflow( let daemon_id = spawn_dataflow_on_machine(daemon_connections, machine.as_deref(), &message) .await - .wrap_err_with(|| { - format!( - "failed to {} dataflow on machine `{machine:?}`", - if build_only { "build" } else { "spawn" } - ) - })?; + .wrap_err_with(|| format!("failed to spawn dataflow on machine `{machine:?}`"))?; daemons.insert(daemon_id); } - tracing::info!( - "successfully triggered dataflow {} `{uuid}`", - if build_only { "build" } else { "spawn" } - ); + tracing::info!("successfully triggered dataflow spawn `{uuid}`",); Ok(SpawnedDataflow { uuid, diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index 6b9f7381..fdfd3596 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -45,5 +45,6 @@ crossbeam = "0.8.4" crossbeam-skiplist = "0.1.3" zenoh = "1.1.1" url = "2.5.4" -git2 = { version = "0.18.0", features = ["vendored-openssl"] } +git2 = { workspace = true } dunce = "1.0.5" +itertools = "0.14" diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 364ca6ae..f7a16c9a 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -2,6 +2,7 @@ use aligned_vec::{AVec, ConstAlign}; use coordinator::CoordinatorEvent; use crossbeam::queue::ArrayQueue; use dora_core::{ + build::{self, BuildInfo, GitManager}, config::{DataId, Input, InputMapping, NodeId, NodeRunConfig, OperatorId}, descriptor::{ read_as_descriptor, CoreNodeKind, Descriptor, DescriptorExt, ResolvedNode, RuntimeNode, @@ -12,10 +13,11 @@ use dora_core::{ }; use dora_message::{ common::{ - DaemonId, DataMessage, DropToken, LogLevel, NodeError, NodeErrorCause, NodeExitStatus, + DaemonId, DataMessage, DropToken, GitSource, LogLevel, NodeError, NodeErrorCause, + NodeExitStatus, }, coordinator_to_cli::DataflowResult, - coordinator_to_daemon::{DaemonCoordinatorEvent, SpawnDataflowNodes}, + coordinator_to_daemon::{BuildDataflowNodes, DaemonCoordinatorEvent, SpawnDataflowNodes}, daemon_to_coordinator::{ CoordinatorRequest, DaemonCoordinatorReply, DaemonEvent, DataflowDaemonResult, }, @@ -24,7 +26,7 @@ use dora_message::{ descriptor::NodeSource, metadata::{self, ArrowTypeInfo}, node_to_daemon::{DynamicNodeEvent, Timestamped}, - DataflowId, + BuildId, DataflowId, SessionId, }; use dora_node_api::{arrow::datatypes::DataType, Parameter}; use eyre::{bail, eyre, Context, ContextCompat, Result}; @@ -38,6 +40,7 @@ use socket_stream_utils::socket_stream_send; use spawn::Spawner; use std::{ collections::{BTreeMap, BTreeSet, HashMap}, + env::current_dir, future::Future, net::SocketAddr, path::{Path, PathBuf}, @@ -101,12 +104,14 @@ pub struct Daemon { logger: DaemonLogger, - repos_in_use: BTreeMap>, + sessions: BTreeMap, + builds: BTreeMap, + git_manager: GitManager, } type DaemonRunResult = BTreeMap>>; -struct NodePrepareTask { +struct NodeBuildTask { node_id: NodeId, dynamic_node: bool, task: F, @@ -148,12 +153,19 @@ impl Daemon { None, clock, Some(remote_daemon_events_tx), + Default::default(), ) .await .map(|_| ()) } - pub async fn run_dataflow(dataflow_path: &Path, uv: bool) -> eyre::Result { + pub async fn run_dataflow( + dataflow_path: &Path, + build_id: Option, + local_build: Option, + session_id: SessionId, + uv: bool, + ) -> eyre::Result { let 
working_dir = dataflow_path .canonicalize() .context("failed to canonicalize dataflow path")? @@ -167,13 +179,14 @@ impl Daemon { let dataflow_id = Uuid::new_v7(Timestamp::now(NoContext)); let spawn_command = SpawnDataflowNodes { + build_id, + session_id, dataflow_id, - working_dir, + local_working_dir: Some(working_dir), spawn_nodes: nodes.keys().cloned().collect(), nodes, dataflow_descriptor: descriptor, uv, - build_only: false, }; let clock = Arc::new(HLC::default()); @@ -204,6 +217,16 @@ impl Daemon { Some(exit_when_done), clock.clone(), None, + if let Some(local_build) = local_build { + let Some(build_id) = build_id else { + bail!("no build_id, but local_build set") + }; + let mut builds = BTreeMap::new(); + builds.insert(build_id, local_build); + builds + } else { + Default::default() + }, ); let spawn_result = reply_rx @@ -235,6 +258,7 @@ impl Daemon { exit_when_done: Option>, clock: Arc, remote_daemon_events_tx: Option>>>, + builds: BTreeMap, ) -> eyre::Result { let coordinator_connection = match coordinator_addr { Some(addr) => { @@ -298,7 +322,9 @@ impl Daemon { clock, zenoh_session, remote_daemon_events_tx, - repos_in_use: Default::default(), + git_manager: Default::default(), + builds, + sessions: Default::default(), }; let dora_events = ReceiverStream::new(dora_events_rx); @@ -418,14 +444,41 @@ impl Daemon { .await?; } }, + Event::BuildDataflowResult { + build_id, + session_id, + result, + } => { + let (build_info, result) = match result { + Ok(build_info) => (Some(build_info), Ok(())), + Err(err) => (None, Err(err)), + }; + if let Some(build_info) = build_info { + self.builds.insert(build_id, build_info); + if let Some(old_build_id) = self.sessions.insert(session_id, build_id) { + self.builds.remove(&old_build_id); + } + } + if let Some(connection) = &mut self.coordinator_connection { + let msg = serde_json::to_vec(&Timestamped { + inner: CoordinatorRequest::Event { + daemon_id: self.daemon_id.clone(), + event: DaemonEvent::BuildResult { + build_id, + result: result.map_err(|err| format!("{err:?}")), + }, + }, + timestamp: self.clock.new_timestamp(), + })?; + socket_stream_send(connection, &msg).await.wrap_err( + "failed to send BuildDataflowResult message to dora-coordinator", + )?; + } + } Event::SpawnDataflowResult { dataflow_id, result, - build_only, } => { - if build_only { - self.running.remove(&dataflow_id); - } if let Some(connection) = &mut self.coordinator_connection { let msg = serde_json::to_vec(&Timestamped { inner: CoordinatorRequest::Event { @@ -437,9 +490,9 @@ impl Daemon { }, timestamp: self.clock.new_timestamp(), })?; - socket_stream_send(connection, &msg) - .await - .wrap_err("failed to send Exit message to dora-coordinator")?; + socket_stream_send(connection, &msg).await.wrap_err( + "failed to send SpawnDataflowResult message to dora-coordinator", + )?; } } } @@ -476,35 +529,93 @@ impl Daemon { reply_tx: Sender>, ) -> eyre::Result { let status = match event { + DaemonCoordinatorEvent::Build(BuildDataflowNodes { + build_id, + session_id, + local_working_dir, + git_sources, + prev_git_sources, + dataflow_descriptor, + nodes_on_machine, + uv, + }) => { + match dataflow_descriptor.communication.remote { + dora_core::config::RemoteCommunicationConfig::Tcp => {} + } + + let base_working_dir = self.base_working_dir(local_working_dir, session_id)?; + + let result = self + .build_dataflow( + build_id, + session_id, + base_working_dir, + git_sources, + prev_git_sources, + dataflow_descriptor, + nodes_on_machine, + uv, + ) + .await; + let (trigger_result, 
result_task) = match result { + Ok(result_task) => (Ok(()), Some(result_task)), + Err(err) => (Err(format!("{err:?}")), None), + }; + let reply = DaemonCoordinatorReply::TriggerBuildResult(trigger_result); + let _ = reply_tx.send(Some(reply)).map_err(|_| { + error!("could not send `TriggerBuildResult` reply from daemon to coordinator") + }); + + let result_tx = self.events_tx.clone(); + let clock = self.clock.clone(); + if let Some(result_task) = result_task { + tokio::spawn(async move { + let message = Timestamped { + inner: Event::BuildDataflowResult { + build_id, + session_id, + result: result_task.await, + }, + timestamp: clock.new_timestamp(), + }; + let _ = result_tx + .send(message) + .map_err(|_| { + error!( + "could not send `BuildResult` reply from daemon to coordinator" + ) + }) + .await; + }); + } + + RunStatus::Continue + } DaemonCoordinatorEvent::Spawn(SpawnDataflowNodes { + build_id, + session_id, dataflow_id, - working_dir, + local_working_dir, nodes, dataflow_descriptor, spawn_nodes, uv, - build_only, }) => { match dataflow_descriptor.communication.remote { dora_core::config::RemoteCommunicationConfig::Tcp => {} } - // Use the working directory if it exists, otherwise use the working directory where the daemon is spawned - let working_dir = if working_dir.exists() { - working_dir - } else { - std::env::current_dir().wrap_err("failed to get current working dir")? - }; + let base_working_dir = self.base_working_dir(local_working_dir, session_id)?; let result = self .spawn_dataflow( + build_id, dataflow_id, - working_dir, + base_working_dir, nodes, dataflow_descriptor, spawn_nodes, uv, - build_only, ) .await; let (trigger_result, result_task) = match result { @@ -524,7 +635,6 @@ impl Daemon { inner: Event::SpawnDataflowResult { dataflow_id, result: result_task.await, - build_only, }, timestamp: clock.new_timestamp(), }; @@ -770,15 +880,100 @@ impl Daemon { } } + #[allow(clippy::too_many_arguments)] + async fn build_dataflow( + &mut self, + build_id: BuildId, + session_id: SessionId, + base_working_dir: PathBuf, + git_sources: BTreeMap, + prev_git_sources: BTreeMap, + dataflow_descriptor: Descriptor, + local_nodes: BTreeSet, + uv: bool, + ) -> eyre::Result>> { + let builder = build::Builder { + session_id, + base_working_dir, + uv, + }; + let nodes = dataflow_descriptor.resolve_aliases_and_set_defaults()?; + + let mut tasks = Vec::new(); + + // build nodes + for node in nodes.into_values().filter(|n| local_nodes.contains(&n.id)) { + let dynamic_node = node.kind.dynamic(); + + let node_id = node.id.clone(); + let mut logger = self.logger.for_node_build(build_id, node_id.clone()); + logger.log(LogLevel::Info, "building").await; + let git_source = git_sources.get(&node_id).cloned(); + let prev_git_source = prev_git_sources.get(&node_id).cloned(); + + let logger_cloned = logger + .try_clone_impl() + .await + .wrap_err("failed to clone logger")?; + + match builder + .clone() + .build_node( + node, + git_source, + prev_git_source, + logger_cloned, + &mut self.git_manager, + ) + .await + .wrap_err_with(|| format!("failed to build node `{node_id}`")) + { + Ok(result) => { + tasks.push(NodeBuildTask { + node_id, + task: result, + dynamic_node, + }); + } + Err(err) => { + logger.log(LogLevel::Error, format!("{err:?}")).await; + return Err(err); + } + } + } + + let task = async move { + let mut info = BuildInfo { + node_working_dirs: Default::default(), + }; + for task in tasks { + let NodeBuildTask { + node_id, + dynamic_node, + task, + } = task; + let node = task + .await + 
.with_context(|| format!("failed to build node `{node_id}`"))?; + info.node_working_dirs + .insert(node_id, node.node_working_dir); + } + Ok(info) + }; + + Ok(task) + } + + #[allow(clippy::too_many_arguments)] async fn spawn_dataflow( &mut self, - dataflow_id: uuid::Uuid, - working_dir: PathBuf, + build_id: Option, + dataflow_id: DataflowId, + base_working_dir: PathBuf, nodes: BTreeMap, dataflow_descriptor: Descriptor, spawn_nodes: BTreeSet, uv: bool, - build_only: bool, ) -> eyre::Result>> { let mut logger = self .logger @@ -790,7 +985,8 @@ impl Daemon { RunningDataflow::new(dataflow_id, self.daemon_id.clone(), &dataflow_descriptor); let dataflow = match self.running.entry(dataflow_id) { std::collections::hash_map::Entry::Vacant(entry) => { - self.working_dir.insert(dataflow_id, working_dir.clone()); + self.working_dir + .insert(dataflow_id, base_working_dir.clone()); entry.insert(dataflow) } std::collections::hash_map::Entry::Occupied(_) => { @@ -800,6 +996,11 @@ impl Daemon { let mut stopped = Vec::new(); + let node_working_dirs = build_id + .and_then(|build_id| self.builds.get(&build_id)) + .map(|info| info.node_working_dirs.clone()) + .unwrap_or_default(); + // calculate info about mappings for node in nodes.values() { let local = spawn_nodes.contains(&node.id); @@ -838,12 +1039,10 @@ impl Daemon { let spawner = Spawner { dataflow_id, - working_dir, daemon_tx: self.events_tx.clone(), dataflow_descriptor, clock: self.clock.clone(), uv, - build_only, }; let mut tasks = Vec::new(); @@ -869,19 +1068,18 @@ impl Daemon { logger .log(LogLevel::Info, Some("daemon".into()), "spawning") .await; + let node_working_dir = node_working_dirs + .get(&node_id) + .unwrap_or(&base_working_dir) + .clone(); match spawner .clone() - .prepare_node( - node, - node_stderr_most_recent, - &mut logger, - &mut self.repos_in_use, - ) + .spawn_node(node, node_working_dir, node_stderr_most_recent, &mut logger) .await .wrap_err_with(|| format!("failed to spawn node `{node_id}`")) { Ok(result) => { - tasks.push(NodePrepareTask { + tasks.push(NodeBuildTask { node_id, task: result, dynamic_node, @@ -979,7 +1177,7 @@ impl Daemon { async fn spawn_prepared_nodes( dataflow_id: Uuid, mut logger: DataflowLogger<'_>, - tasks: Vec>>>, + tasks: Vec>>>, events_tx: mpsc::Sender>, clock: Arc, ) -> eyre::Result<()> { @@ -995,7 +1193,7 @@ impl Daemon { let mut failed_to_prepare = None; let mut prepared_nodes = Vec::new(); for task in tasks { - let NodePrepareTask { + let NodeBuildTask { node_id, dynamic_node, task, @@ -1567,9 +1765,12 @@ impl Daemon { .clone(), }; - self.repos_in_use.values_mut().for_each(|dataflows| { - dataflows.remove(&dataflow_id); - }); + self.git_manager + .clones_in_use + .values_mut() + .for_each(|dataflows| { + dataflows.remove(&dataflow_id); + }); logger .log( @@ -1799,6 +2000,34 @@ impl Daemon { } Ok(RunStatus::Continue) } + + fn base_working_dir( + &self, + local_working_dir: Option, + session_id: SessionId, + ) -> eyre::Result { + match local_working_dir { + Some(working_dir) => { + // check that working directory exists + if working_dir.exists() { + Ok(working_dir) + } else { + bail!( + "working directory does not exist: {}", + working_dir.display(), + ) + } + } + None => { + // use subfolder of daemon working dir + let daemon_working_dir = + current_dir().context("failed to get daemon working dir")?; + Ok(daemon_working_dir + .join("_work") + .join(session_id.to_string())) + } + } + } } async fn set_up_event_stream( @@ -2272,10 +2501,14 @@ pub enum Event { dynamic_node: bool, result: Result, }, + 
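// [editor's note] `base_working_dir` above falls back to a `_work/<session id>`
// subfolder of the daemon's own working directory when the CLI did not supply a local
// one. A std-only sketch of the same decision (error type simplified to
// `std::io::Error`; the real code uses eyre):
fn example_base_working_dir(
    local: Option<std::path::PathBuf>,
    session_id: &str,
) -> std::io::Result<std::path::PathBuf> {
    match local {
        // an explicitly given directory must already exist
        Some(dir) if dir.exists() => Ok(dir),
        Some(dir) => Err(std::io::Error::new(
            std::io::ErrorKind::NotFound,
            format!("working directory does not exist: {}", dir.display()),
        )),
        // otherwise work below the daemon's current directory
        None => Ok(std::env::current_dir()?.join("_work").join(session_id)),
    }
}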
BuildDataflowResult { + build_id: BuildId, + session_id: SessionId, + result: eyre::Result, + }, SpawnDataflowResult { dataflow_id: Uuid, result: eyre::Result<()>, - build_only: bool, }, } @@ -2298,6 +2531,7 @@ impl Event { Event::SecondCtrlC => "SecondCtrlC", Event::DaemonError(_) => "DaemonError", Event::SpawnNodeResult { .. } => "SpawnNodeResult", + Event::BuildDataflowResult { .. } => "BuildDataflowResult", Event::SpawnDataflowResult { .. } => "SpawnDataflowResult", } } diff --git a/binaries/daemon/src/log.rs b/binaries/daemon/src/log.rs index 26488361..c5fe171a 100644 --- a/binaries/daemon/src/log.rs +++ b/binaries/daemon/src/log.rs @@ -4,10 +4,11 @@ use std::{ sync::Arc, }; -use dora_core::{config::NodeId, uhlc}; +use dora_core::{build::BuildLogger, config::NodeId, uhlc}; use dora_message::{ common::{DaemonId, LogLevel, LogMessage, Timestamped}, daemon_to_coordinator::{CoordinatorRequest, DaemonEvent}, + BuildId, }; use eyre::Context; use tokio::net::TcpStream; @@ -81,7 +82,7 @@ impl<'a> DataflowLogger<'a> { message: impl Into, ) { self.logger - .log(level, self.dataflow_id, node_id, target, message) + .log(level, Some(self.dataflow_id), node_id, target, message) .await } @@ -93,6 +94,44 @@ impl<'a> DataflowLogger<'a> { } } +pub struct NodeBuildLogger<'a> { + build_id: BuildId, + node_id: NodeId, + logger: CowMut<'a, DaemonLogger>, +} + +impl NodeBuildLogger<'_> { + pub async fn log(&mut self, level: LogLevel, message: impl Into) { + self.logger + .log_build(self.build_id, level, Some(self.node_id.clone()), message) + .await + } + + pub async fn try_clone_impl(&self) -> eyre::Result> { + Ok(NodeBuildLogger { + build_id: self.build_id, + node_id: self.node_id.clone(), + logger: CowMut::Owned(self.logger.try_clone().await?), + }) + } +} + +impl BuildLogger for NodeBuildLogger<'_> { + type Clone = NodeBuildLogger<'static>; + + fn log_message( + &mut self, + level: LogLevel, + message: impl Into + Send, + ) -> impl std::future::Future + Send { + self.log(level, message) + } + + fn try_clone(&self) -> impl std::future::Future> + Send { + self.try_clone_impl() + } +} + pub struct DaemonLogger { daemon_id: DaemonId, logger: Logger, @@ -106,6 +145,14 @@ impl DaemonLogger { } } + pub fn for_node_build(&mut self, build_id: BuildId, node_id: NodeId) -> NodeBuildLogger { + NodeBuildLogger { + build_id, + node_id, + logger: CowMut::Borrowed(self), + } + } + pub fn inner(&self) -> &Logger { &self.logger } @@ -113,12 +160,13 @@ impl DaemonLogger { pub async fn log( &mut self, level: LogLevel, - dataflow_id: Uuid, + dataflow_id: Option, node_id: Option, target: Option, message: impl Into, ) { let message = LogMessage { + build_id: None, daemon_id: Some(self.daemon_id.clone()), dataflow_id, node_id, @@ -132,6 +180,28 @@ impl DaemonLogger { self.logger.log(message).await } + pub async fn log_build( + &mut self, + build_id: BuildId, + level: LogLevel, + node_id: Option, + message: impl Into, + ) { + let message = LogMessage { + build_id: Some(build_id), + daemon_id: Some(self.daemon_id.clone()), + dataflow_id: None, + node_id, + level, + target: Some("build".into()), + module_path: None, + file: None, + line: None, + message: message.into(), + }; + self.logger.log(message).await + } + pub(crate) fn daemon_id(&self) -> &DaemonId { &self.daemon_id } @@ -181,7 +251,8 @@ impl Logger { match message.level { LogLevel::Error => { tracing::error!( - dataflow_id = message.dataflow_id.to_string(), + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| 
id.to_string()), node_id = ?message.node_id.map(|id| id.to_string()), target = message.target, module_path = message.module_path, @@ -193,7 +264,8 @@ impl Logger { } LogLevel::Warn => { tracing::warn!( - dataflow_id = message.dataflow_id.to_string(), + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), node_id = ?message.node_id.map(|id| id.to_string()), target = message.target, module_path = message.module_path, @@ -205,7 +277,8 @@ impl Logger { } LogLevel::Info => { tracing::info!( - dataflow_id = message.dataflow_id.to_string(), + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), node_id = ?message.node_id.map(|id| id.to_string()), target = message.target, module_path = message.module_path, @@ -217,7 +290,8 @@ impl Logger { } LogLevel::Debug => { tracing::debug!( - dataflow_id = message.dataflow_id.to_string(), + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), node_id = ?message.node_id.map(|id| id.to_string()), target = message.target, module_path = message.module_path, diff --git a/binaries/daemon/src/spawn/git.rs b/binaries/daemon/src/spawn/git.rs deleted file mode 100644 index 9803d1f2..00000000 --- a/binaries/daemon/src/spawn/git.rs +++ /dev/null @@ -1,286 +0,0 @@ -use crate::log::NodeLogger; -use dora_message::{common::LogLevel, descriptor::GitRepoRev, DataflowId}; -use eyre::{ContextCompat, WrapErr}; -use git2::FetchOptions; -use std::{ - collections::{BTreeMap, BTreeSet}, - path::{Path, PathBuf}, -}; -use url::Url; -use uuid::Uuid; - -pub struct GitFolder { - /// The URL of the git repository. - repo_addr: String, - /// The branch, tag, or git revision to checkout. - rev: Option, - /// The directory that should contain the checked-out repository. - clone_dir: PathBuf, - /// Specifies whether an existing repo should be reused. 
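// [editor's note] The deleted `GitFolder` below derived its clone directory from the
// repo URL plus the requested rev; the `GitManager` that replaces it in dora-core
// presumably keeps a similar layout. A sketch of the path derivation with the `url`
// crate, assuming the `<target>/<host>/<path...>/branch/<name>` scheme of the removed
// code (branch name and target dir are illustrative):
fn example_clone_dir() -> eyre::Result<std::path::PathBuf> {
    use eyre::ContextCompat;
    let repo = url::Url::parse("https://github.com/dora-rs/dora.git")?;
    let mut dir = std::path::PathBuf::from("git");
    dir.push(repo.host_str().context("git URL has no hostname")?);
    dir.extend(repo.path_segments().context("no path in git URL")?);
    dir.push("branch");
    dir.push("main");
    // -> git/github.com/dora-rs/dora.git/branch/main
    Ok(dir)
}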
- reuse: ReuseOptions, -} - -impl GitFolder { - pub fn choose_clone_dir( - dataflow_id: uuid::Uuid, - repo_addr: String, - rev: Option, - target_dir: &Path, - repos_in_use: &mut BTreeMap>, - ) -> eyre::Result { - let repo_url = Url::parse(&repo_addr).context("failed to parse git repository URL")?; - - let base_dir = { - let base = { - let mut path = - target_dir.join(repo_url.host_str().context("git URL has no hostname")?); - - path.extend(repo_url.path_segments().context("no path in git URL")?); - path - }; - match &rev { - None => base, - Some(rev) => match rev { - GitRepoRev::Branch(branch) => base.join("branch").join(branch), - GitRepoRev::Tag(tag) => base.join("tag").join(tag), - GitRepoRev::Rev(rev) => base.join("rev").join(rev), - }, - } - }; - let clone_dir = if clone_dir_exists(&base_dir, repos_in_use) { - let used_by_other = used_by_other_dataflow(dataflow_id, &base_dir, repos_in_use); - if used_by_other { - // don't reuse, choose new directory - // (TODO reuse if still up to date) - - let dir_name = base_dir.file_name().unwrap().to_str().unwrap(); - let mut i = 1; - loop { - let new_path = base_dir.with_file_name(format!("{dir_name}-{i}")); - if clone_dir_exists(&new_path, repos_in_use) - && used_by_other_dataflow(dataflow_id, &new_path, repos_in_use) - { - i += 1; - } else { - break new_path; - } - } - } else { - base_dir - } - } else { - base_dir - }; - let clone_dir = dunce::simplified(&clone_dir).to_owned(); - - let reuse = if clone_dir_exists(&clone_dir, repos_in_use) { - let empty = BTreeSet::new(); - let in_use = repos_in_use.get(&clone_dir).unwrap_or(&empty); - let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); - if used_by_other_dataflow { - // The directory is currently in use by another dataflow. We currently don't - // support reusing the same clone across multiple dataflow runs. Above, we - // choose a new directory if we detect such a case. So this `if` branch - // should never be reached. - eyre::bail!("clone_dir is already in use by other dataflow") - } else if in_use.is_empty() { - // The cloned repo is not used by any dataflow, so we can safely reuse it. However, - // the clone might be still on an older commit, so we need to do a `git fetch` - // before we reuse it. - ReuseOptions::ReuseAfterFetch - } else { - // This clone is already used for another node of this dataflow. We will do a - // `git fetch` operation for the first node of this dataflow, so we don't need - // to do it again for other nodes of the dataflow. So we can simply reuse the - // directory without doing any additional git operations. 
- ReuseOptions::Reuse - } - } else { - ReuseOptions::NewClone - }; - repos_in_use - .entry(clone_dir.clone()) - .or_default() - .insert(dataflow_id); - - Ok(GitFolder { - clone_dir, - reuse, - repo_addr, - rev, - }) - } - - pub async fn prepare(self, logger: &mut NodeLogger<'_>) -> eyre::Result { - let GitFolder { - clone_dir, - reuse, - repo_addr, - rev, - } = self; - - let rev_str = rev_str(&rev); - let refname = rev.clone().map(|rev| match rev { - GitRepoRev::Branch(branch) => format!("refs/remotes/origin/{branch}"), - GitRepoRev::Tag(tag) => format!("refs/tags/{tag}"), - GitRepoRev::Rev(rev) => rev, - }); - - match reuse { - ReuseOptions::NewClone => { - let repository = clone_into(&repo_addr, &rev, &clone_dir, logger).await?; - checkout_tree(&repository, refname)?; - } - ReuseOptions::ReuseAfterFetch => { - logger - .log( - LogLevel::Info, - None, - format!("fetching changes and reusing {repo_addr}{rev_str}"), - ) - .await; - let refname_cloned = refname.clone(); - let clone_dir = clone_dir.clone(); - let repository = fetch_changes(clone_dir, refname_cloned).await?; - checkout_tree(&repository, refname)?; - } - ReuseOptions::Reuse => { - logger - .log( - LogLevel::Info, - None, - format!("reusing up-to-date {repo_addr}{rev_str}"), - ) - .await; - } - }; - Ok(clone_dir) - } -} - -fn used_by_other_dataflow( - dataflow_id: uuid::Uuid, - clone_dir_base: &PathBuf, - repos_in_use: &mut BTreeMap>, -) -> bool { - let empty = BTreeSet::new(); - let in_use = repos_in_use.get(clone_dir_base).unwrap_or(&empty); - let used_by_other_dataflow = in_use.iter().any(|&id| id != dataflow_id); - used_by_other_dataflow -} - -enum ReuseOptions { - /// Create a new clone of the repository. - NewClone, - /// Reuse an existing up-to-date clone of the repository. - Reuse, - /// Update an older clone of the repository, then reuse it. 
- ReuseAfterFetch, -} - -fn rev_str(rev: &Option) -> String { - match rev { - Some(GitRepoRev::Branch(branch)) => format!(" (branch {branch})"), - Some(GitRepoRev::Tag(tag)) => format!(" (tag {tag})"), - Some(GitRepoRev::Rev(rev)) => format!(" (rev {rev})"), - None => String::new(), - } -} - -async fn clone_into( - repo_addr: &String, - rev: &Option, - clone_dir: &Path, - logger: &mut NodeLogger<'_>, -) -> eyre::Result { - if let Some(parent) = clone_dir.parent() { - tokio::fs::create_dir_all(parent) - .await - .context("failed to create parent directory for git clone")?; - } - - let rev_str = rev_str(rev); - logger - .log( - LogLevel::Info, - None, - format!("cloning {repo_addr}{rev_str} into {}", clone_dir.display()), - ) - .await; - let rev: Option = rev.clone(); - let clone_into = clone_dir.to_owned(); - let repo_addr = repo_addr.clone(); - let task = tokio::task::spawn_blocking(move || { - let mut builder = git2::build::RepoBuilder::new(); - let mut fetch_options = git2::FetchOptions::new(); - fetch_options.download_tags(git2::AutotagOption::All); - builder.fetch_options(fetch_options); - if let Some(GitRepoRev::Branch(branch)) = &rev { - builder.branch(branch); - } - builder - .clone(&repo_addr, &clone_into) - .context("failed to clone repo") - }); - let repo = task.await??; - Ok(repo) -} - -async fn fetch_changes( - repo_dir: PathBuf, - refname: Option, -) -> Result { - let fetch_changes = tokio::task::spawn_blocking(move || { - let repository = git2::Repository::open(&repo_dir).context("failed to open git repo")?; - - { - let mut remote = repository - .find_remote("origin") - .context("failed to find remote `origin` in repo")?; - remote - .connect(git2::Direction::Fetch) - .context("failed to connect to remote")?; - let default_branch = remote - .default_branch() - .context("failed to get default branch for remote")?; - let fetch = match &refname { - Some(refname) => refname, - None => default_branch - .as_str() - .context("failed to read default branch as string")?, - }; - let mut fetch_options = FetchOptions::new(); - fetch_options.download_tags(git2::AutotagOption::All); - remote - .fetch(&[&fetch], Some(&mut fetch_options), None) - .context("failed to fetch from git repo")?; - } - Result::<_, eyre::Error>::Ok(repository) - }); - let repository = fetch_changes.await??; - Ok(repository) -} - -fn checkout_tree(repository: &git2::Repository, refname: Option) -> eyre::Result<()> { - if let Some(refname) = refname { - let (object, reference) = repository - .revparse_ext(&refname) - .context("failed to parse ref")?; - repository - .checkout_tree(&object, None) - .context("failed to checkout ref")?; - match reference { - Some(reference) => repository - .set_head(reference.name().context("failed to get reference_name")?) 
- .context("failed to set head")?, - None => repository - .set_head_detached(object.id()) - .context("failed to set detached head")?, - } - } - Ok(()) -} - -fn clone_dir_exists(dir: &PathBuf, repos_in_use: &BTreeMap>) -> bool { - repos_in_use.contains_key(dir) || dir.exists() -} diff --git a/binaries/daemon/src/spawn/mod.rs b/binaries/daemon/src/spawn/mod.rs index 9bf15360..055045a9 100644 --- a/binaries/daemon/src/spawn/mod.rs +++ b/binaries/daemon/src/spawn/mod.rs @@ -7,11 +7,10 @@ use aligned_vec::{AVec, ConstAlign}; use crossbeam::queue::ArrayQueue; use dora_arrow_convert::IntoArrow; use dora_core::{ - build::run_build_command, config::DataId, descriptor::{ - resolve_path, source_is_url, CustomNode, Descriptor, OperatorDefinition, OperatorSource, - PythonSource, ResolvedNode, ResolvedNodeExt, DYNAMIC_SOURCE, SHELL_SOURCE, + resolve_path, source_is_url, Descriptor, OperatorDefinition, OperatorSource, PythonSource, + ResolvedNode, ResolvedNodeExt, DYNAMIC_SOURCE, SHELL_SOURCE, }, get_python_path, uhlc::HLC, @@ -21,7 +20,6 @@ use dora_message::{ common::{LogLevel, LogMessage}, daemon_to_coordinator::{DataMessage, NodeExitStatus, Timestamped}, daemon_to_node::{NodeConfig, RuntimeConfig}, - descriptor::EnvValue, id::NodeId, DataflowId, }; @@ -31,9 +29,7 @@ use dora_node_api::{ Metadata, }; use eyre::{ContextCompat, WrapErr}; -use git::GitFolder; use std::{ - collections::{BTreeMap, BTreeSet}, future::Future, path::{Path, PathBuf}, process::Stdio, @@ -46,27 +42,23 @@ use tokio::{ }; use tracing::error; -mod git; - #[derive(Clone)] pub struct Spawner { pub dataflow_id: DataflowId, - pub working_dir: PathBuf, pub daemon_tx: mpsc::Sender>, pub dataflow_descriptor: Descriptor, /// clock is required for generating timestamps when dropping messages early because queue is full pub clock: Arc, pub uv: bool, - pub build_only: bool, } impl Spawner { - pub async fn prepare_node( + pub async fn spawn_node( self, node: ResolvedNode, + node_working_dir: PathBuf, node_stderr_most_recent: Arc>, logger: &mut NodeLogger<'_>, - repos_in_use: &mut BTreeMap>, ) -> eyre::Result>> { let dataflow_id = self.dataflow_id; let node_id = node.id.clone(); @@ -101,24 +93,6 @@ impl Spawner { dynamic: node.kind.dynamic(), }; - let prepared_git = if let dora_core::descriptor::CoreNodeKind::Custom(CustomNode { - source: dora_message::descriptor::NodeSource::GitBranch { repo, rev }, - .. 
- }) = &node.kind - { - let target_dir = self.working_dir.join("build"); - let git_folder = GitFolder::choose_clone_dir( - self.dataflow_id, - repo.clone(), - rev.clone(), - &target_dir, - repos_in_use, - )?; - Some(git_folder) - } else { - None - }; - let mut logger = logger .try_clone() .await @@ -126,10 +100,10 @@ impl Spawner { let task = async move { self.prepare_node_inner( node, + node_working_dir, &mut logger, dataflow_id, node_config, - prepared_git, node_stderr_most_recent, ) .await @@ -138,33 +112,21 @@ impl Spawner { } async fn prepare_node_inner( - mut self, + self, node: ResolvedNode, + node_working_dir: PathBuf, logger: &mut NodeLogger<'_>, dataflow_id: uuid::Uuid, node_config: NodeConfig, - git_folder: Option, node_stderr_most_recent: Arc>, ) -> eyre::Result { let (command, error_msg) = match &node.kind { dora_core::descriptor::CoreNodeKind::Custom(n) => { - let build_dir = match git_folder { - Some(git_folder) => git_folder.prepare(logger).await?, - None => self.working_dir.clone(), - }; - - if let Some(build) = &n.build { - self.build_node(logger, &node.env, build_dir.clone(), build) - .await?; - } - let mut command = if self.build_only { - None - } else { - path_spawn_command(&build_dir, self.uv, logger, n, true).await? - }; + let mut command = + path_spawn_command(&node_working_dir, self.uv, logger, n, true).await?; if let Some(command) = &mut command { - command.current_dir(&self.working_dir); + command.current_dir(&node_working_dir); command.stdin(Stdio::null()); command.env( @@ -205,14 +167,6 @@ impl Spawner { (command, error_msg) } dora_core::descriptor::CoreNodeKind::Runtime(n) => { - // run build commands - for operator in &n.operators { - if let Some(build) = &operator.config.build { - self.build_node(logger, &node.env, self.working_dir.clone(), build) - .await?; - } - } - let python_operators: Vec<&OperatorDefinition> = n .operators .iter() @@ -224,9 +178,7 @@ impl Spawner { .iter() .any(|x| !matches!(x.config.source, OperatorSource::Python { .. 
})); - let mut command = if self.build_only { - None - } else if !python_operators.is_empty() && !other_operators { + let mut command = if !python_operators.is_empty() && !other_operators { // Use python to spawn runtime if there is a python operator // TODO: Handle multi-operator runtime once sub-interpreter is supported @@ -304,7 +256,7 @@ impl Spawner { }; if let Some(command) = &mut command { - command.current_dir(&self.working_dir); + command.current_dir(&node_working_dir); command.env( "DORA_RUNTIME_CONFIG", @@ -337,7 +289,7 @@ impl Spawner { Ok(PreparedNode { command, spawn_error_msg: error_msg, - working_dir: self.working_dir, + node_working_dir, dataflow_id, node, node_config, @@ -346,50 +298,12 @@ impl Spawner { node_stderr_most_recent, }) } - - async fn build_node( - &mut self, - logger: &mut NodeLogger<'_>, - node_env: &Option>, - working_dir: PathBuf, - build: &String, - ) -> Result<(), eyre::Error> { - logger - .log( - LogLevel::Info, - None, - format!("running build command: `{build}"), - ) - .await; - let build = build.to_owned(); - let uv = self.uv; - let node_env = node_env.clone(); - let mut logger = logger.try_clone().await.context("failed to clone logger")?; - let (stdout_tx, mut stdout) = tokio::sync::mpsc::channel(10); - let task = tokio::task::spawn_blocking(move || { - run_build_command(&build, &working_dir, uv, &node_env, stdout_tx) - .context("build command failed") - }); - tokio::spawn(async move { - while let Some(line) = stdout.recv().await { - logger - .log( - LogLevel::Info, - Some("build command".into()), - line.unwrap_or_else(|err| format!("io err: {}", err.kind())), - ) - .await; - } - }); - task.await??; - Ok(()) - } } pub struct PreparedNode { command: Option, spawn_error_msg: String, - working_dir: PathBuf, + node_working_dir: PathBuf, dataflow_id: DataflowId, node: ResolvedNode, node_config: NodeConfig, @@ -430,7 +344,7 @@ impl PreparedNode { .await; let dataflow_dir: PathBuf = self - .working_dir + .node_working_dir .join("out") .join(self.dataflow_id.to_string()); if !dataflow_dir.exists() { @@ -438,7 +352,7 @@ impl PreparedNode { } let (tx, mut rx) = mpsc::channel(10); let mut file = File::create(log::log_path( - &self.working_dir, + &self.node_working_dir, &self.dataflow_id, &self.node.id, )) @@ -642,7 +556,8 @@ impl PreparedNode { cloned_logger .log(LogMessage { daemon_id: Some(daemon_id.clone()), - dataflow_id, + dataflow_id: Some(dataflow_id), + build_id: None, level: LogLevel::Info, node_id: Some(node_id.clone()), target: Some("stdout".into()), diff --git a/examples/benchmark/run.rs b/examples/benchmark/run.rs index 8e0076bc..b6bed6fe 100644 --- a/examples/benchmark/run.rs +++ b/examples/benchmark/run.rs @@ -11,12 +11,26 @@ async fn main() -> eyre::Result<()> { .wrap_err("failed to set working dir")?; let dataflow = Path::new("dataflow.yml"); + build_dataflow(dataflow).await?; run_dataflow(dataflow).await?; Ok(()) } +async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { + let cargo = std::env::var("CARGO").unwrap(); + let mut cmd = tokio::process::Command::new(&cargo); + cmd.arg("run"); + cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); + cmd.arg("--").arg("build").arg(dataflow); + if !cmd.status().await?.success() { + bail!("failed to build dataflow"); + }; + Ok(()) +} + async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let cargo = std::env::var("CARGO").unwrap(); let mut cmd = tokio::process::Command::new(&cargo); diff --git a/examples/c++-dataflow/.gitignore b/examples/c++-dataflow/.gitignore index 
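// [editor's note] `Spawner::spawn_node` above now receives a per-node working
// directory (a git-sourced node runs out of its clone's build dir) instead of one
// shared dataflow dir. A minimal sketch of spawning with a node-specific cwd (the
// binary path is illustrative, taken from the example dataflow below):
async fn example_spawn(node_working_dir: &std::path::Path) -> std::io::Result<()> {
    let mut command = tokio::process::Command::new("target/debug/rust-dataflow-example-node");
    command
        .current_dir(node_working_dir)
        .stdin(std::process::Stdio::null());
    let status = command.status().await?;
    assert!(status.success(), "node exited with an error");
    Ok(())
}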
5761abcf..d255f72c 100644
--- a/examples/c++-dataflow/.gitignore
+++ b/examples/c++-dataflow/.gitignore
@@ -1 +1,2 @@
 *.o
+/build
diff --git a/examples/c++-ros2-dataflow/.gitignore b/examples/c++-ros2-dataflow/.gitignore
index 5761abcf..d255f72c 100644
--- a/examples/c++-ros2-dataflow/.gitignore
+++ b/examples/c++-ros2-dataflow/.gitignore
@@ -1 +1,2 @@
 *.o
+/build
diff --git a/examples/multiple-daemons/run.rs b/examples/multiple-daemons/run.rs
index 130d43c1..4d8a40e1 100644
--- a/examples/multiple-daemons/run.rs
+++ b/examples/multiple-daemons/run.rs
@@ -1,3 +1,4 @@
+use dora_cli::session::DataflowSession;
 use dora_coordinator::{ControlEvent, Event};
 use dora_core::{
     descriptor::{read_as_descriptor, DescriptorExt},
@@ -37,6 +38,7 @@ async fn main() -> eyre::Result<()> {
         .wrap_err("failed to set working dir")?;
 
     let dataflow = Path::new("dataflow.yml");
+    build_dataflow(dataflow).await?;
 
     let (coordinator_events_tx, coordinator_events_rx) = mpsc::channel(1);
     let coordinator_bind = SocketAddr::new(
@@ -138,15 +140,19 @@ async fn start_dataflow(
         .check(&working_dir)
         .wrap_err("could not validate yaml")?;
 
+    let dataflow_session =
+        DataflowSession::read_session(dataflow).context("failed to read DataflowSession")?;
+
     let (reply_sender, reply) = oneshot::channel();
     coordinator_events_tx
         .send(Event::Control(ControlEvent::IncomingRequest {
             request: ControlRequest::Start {
+                build_id: dataflow_session.build_id,
+                session_id: dataflow_session.session_id,
                 dataflow: dataflow_descriptor,
-                local_working_dir: working_dir,
+                local_working_dir: Some(working_dir),
                 name: None,
                 uv: false,
-                build_only: false,
             },
             reply_sender,
         }))
@@ -228,6 +234,18 @@ async fn destroy(coordinator_events_tx: &Sender<Event>) -> eyre::Result<()> {
     }
 }
 
+async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> {
+    let cargo = std::env::var("CARGO").unwrap();
+    let mut cmd = tokio::process::Command::new(&cargo);
+    cmd.arg("run");
+    cmd.arg("--package").arg("dora-cli");
+    cmd.arg("--").arg("build").arg(dataflow);
+    if !cmd.status().await?.success() {
+        bail!("failed to build dataflow");
+    };
+    Ok(())
+}
+
 async fn run_daemon(coordinator: String, machine_id: &str) -> eyre::Result<()> {
     let cargo = std::env::var("CARGO").unwrap();
     let mut cmd = tokio::process::Command::new(&cargo);
diff --git a/examples/python-ros2-dataflow/run.rs b/examples/python-ros2-dataflow/run.rs
index 2873426e..23b254e2 100644
--- a/examples/python-ros2-dataflow/run.rs
+++ b/examples/python-ros2-dataflow/run.rs
@@ -40,6 +40,15 @@ async fn main() -> eyre::Result<()> {
 
 async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> {
     let cargo = std::env::var("CARGO").unwrap();
+    // First build the dataflow (install requirements)
+    let mut cmd = tokio::process::Command::new(&cargo);
+    cmd.arg("run");
+    cmd.arg("--package").arg("dora-cli");
+    cmd.arg("--").arg("build").arg(dataflow).arg("--uv");
+    if !cmd.status().await?.success() {
+        bail!("failed to build dataflow");
+    };
+
     let mut cmd = tokio::process::Command::new(&cargo);
     cmd.arg("run");
     cmd.arg("--package").arg("dora-cli");
diff --git a/examples/rust-dataflow-git/.gitignore b/examples/rust-dataflow-git/.gitignore
new file mode 100644
index 00000000..dfdc87e3
--- /dev/null
+++ b/examples/rust-dataflow-git/.gitignore
@@ -0,0 +1,2 @@
+/build
+/git
diff --git a/examples/rust-dataflow-git/dataflow.yml b/examples/rust-dataflow-git/dataflow.yml
index f4bca5df..a64b2170 100644
--- a/examples/rust-dataflow-git/dataflow.yml
+++ b/examples/rust-dataflow-git/dataflow.yml
@@ -1,7 +1,7 @@
 nodes:
   - id: rust-node
     git: 
https://github.com/dora-rs/dora.git - rev: e31b2a34 # pinned commit, update this when changing the message crate + rev: 64a2dc9c # pinned commit, update this when changing the message crate build: cargo build -p rust-dataflow-example-node path: target/debug/rust-dataflow-example-node inputs: @@ -11,7 +11,7 @@ nodes: - id: rust-status-node git: https://github.com/dora-rs/dora.git - rev: e31b2a34 # pinned commit, update this when changing the message crate + rev: 64a2dc9c # pinned commit, update this when changing the message crate build: cargo build -p rust-dataflow-example-status-node path: target/debug/rust-dataflow-example-status-node inputs: @@ -22,7 +22,7 @@ nodes: - id: rust-sink git: https://github.com/dora-rs/dora.git - rev: e31b2a34 # pinned commit, update this when changing the message crate + rev: 64a2dc9c # pinned commit, update this when changing the message crate build: cargo build -p rust-dataflow-example-sink path: target/debug/rust-dataflow-example-sink inputs: diff --git a/examples/rust-dataflow-git/run.rs b/examples/rust-dataflow-git/run.rs index 6a6a8782..490c5c57 100644 --- a/examples/rust-dataflow-git/run.rs +++ b/examples/rust-dataflow-git/run.rs @@ -16,12 +16,25 @@ async fn main() -> eyre::Result<()> { } else { Path::new("dataflow.yml") }; + build_dataflow(dataflow).await?; run_dataflow(dataflow).await?; Ok(()) } +async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { + let cargo = std::env::var("CARGO").unwrap(); + let mut cmd = tokio::process::Command::new(&cargo); + cmd.arg("run"); + cmd.arg("--package").arg("dora-cli"); + cmd.arg("--").arg("build").arg(dataflow); + if !cmd.status().await?.success() { + bail!("failed to build dataflow"); + }; + Ok(()) +} + async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let cargo = std::env::var("CARGO").unwrap(); let mut cmd = tokio::process::Command::new(&cargo); diff --git a/examples/rust-dataflow-url/.gitignore b/examples/rust-dataflow-url/.gitignore new file mode 100644 index 00000000..796b96d1 --- /dev/null +++ b/examples/rust-dataflow-url/.gitignore @@ -0,0 +1 @@ +/build diff --git a/examples/rust-dataflow-url/run.rs b/examples/rust-dataflow-url/run.rs index e93a5d28..6f511970 100644 --- a/examples/rust-dataflow-url/run.rs +++ b/examples/rust-dataflow-url/run.rs @@ -11,12 +11,25 @@ async fn main() -> eyre::Result<()> { .wrap_err("failed to set working dir")?; let dataflow = Path::new("dataflow.yml"); + build_dataflow(dataflow).await?; run_dataflow(dataflow).await?; Ok(()) } +async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { + let cargo = std::env::var("CARGO").unwrap(); + let mut cmd = tokio::process::Command::new(&cargo); + cmd.arg("run"); + cmd.arg("--package").arg("dora-cli"); + cmd.arg("--").arg("build").arg(dataflow); + if !cmd.status().await?.success() { + bail!("failed to build dataflow"); + }; + Ok(()) +} + async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let cargo = std::env::var("CARGO").unwrap(); let mut cmd = tokio::process::Command::new(&cargo); diff --git a/examples/rust-dataflow/run.rs b/examples/rust-dataflow/run.rs index 6a6a8782..490c5c57 100644 --- a/examples/rust-dataflow/run.rs +++ b/examples/rust-dataflow/run.rs @@ -16,12 +16,25 @@ async fn main() -> eyre::Result<()> { } else { Path::new("dataflow.yml") }; + build_dataflow(dataflow).await?; run_dataflow(dataflow).await?; Ok(()) } +async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { + let cargo = std::env::var("CARGO").unwrap(); + let mut cmd = tokio::process::Command::new(&cargo); + 
cmd.arg("run"); + cmd.arg("--package").arg("dora-cli"); + cmd.arg("--").arg("build").arg(dataflow); + if !cmd.status().await?.success() { + bail!("failed to build dataflow"); + }; + Ok(()) +} + async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let cargo = std::env::var("CARGO").unwrap(); let mut cmd = tokio::process::Command::new(&cargo); diff --git a/libraries/core/Cargo.toml b/libraries/core/Cargo.toml index 7d9233b1..af467ffe 100644 --- a/libraries/core/Cargo.toml +++ b/libraries/core/Cargo.toml @@ -19,8 +19,11 @@ which = "5.0.0" uuid = { version = "1.7", features = ["serde", "v7"] } tracing = "0.1" serde-with-expand-env = "1.1.0" -tokio = { version = "1.24.1", features = ["fs", "process", "sync"] } +tokio = { version = "1.24.1", features = ["fs", "process", "sync", "rt"] } schemars = "0.8.19" serde_json = "1.0.117" log = { version = "0.4.21", features = ["serde"] } dunce = "1.0.5" +url = "2.5.4" +git2 = { workspace = true } +itertools = "0.14" diff --git a/libraries/core/src/build.rs b/libraries/core/src/build/build_command.rs similarity index 100% rename from libraries/core/src/build.rs rename to libraries/core/src/build/build_command.rs diff --git a/libraries/core/src/build/git.rs b/libraries/core/src/build/git.rs new file mode 100644 index 00000000..f53a5c9e --- /dev/null +++ b/libraries/core/src/build/git.rs @@ -0,0 +1,353 @@ +use crate::build::BuildLogger; +use dora_message::{common::LogLevel, descriptor::GitRepoRev, DataflowId, SessionId}; +use eyre::{bail, ContextCompat, WrapErr}; +use git2::FetchOptions; +use itertools::Itertools; +use std::{ + collections::{BTreeMap, BTreeSet}, + path::{Path, PathBuf}, +}; +use url::Url; + +#[derive(Default)] +pub struct GitManager { + /// Directories that are currently in use by running dataflows. + pub clones_in_use: BTreeMap>, + /// Builds that are prepared, but not done yet. + prepared_builds: BTreeMap, + reuse_for: BTreeMap, +} + +#[derive(Default)] +struct PreparedBuild { + /// Clone dirs that will be created during the build process. + /// + /// This allows subsequent nodes to reuse the dirs. + planned_clone_dirs: BTreeSet, +} + +impl GitManager { + pub fn choose_clone_dir( + &mut self, + session_id: SessionId, + repo_url: Url, + commit_hash: String, + prev_commit_hash: Option, + target_dir: &Path, + ) -> eyre::Result { + let clone_dir = Self::clone_dir_path(&target_dir, &repo_url, &commit_hash)?; + + if let Some(using) = self.clones_in_use.get(&clone_dir) { + if !using.is_empty() { + // The directory is currently in use by another dataflow. Rebuilding + // while a dataflow is running could lead to unintended behavior. + eyre::bail!( + "the build directory is still in use by the following \ + dataflows, please stop them before rebuilding: {}", + using.iter().join(", ") + ) + } + } + + let reuse = if self.clone_dir_ready(session_id, &clone_dir) { + // The directory already contains a checkout of the commit we're interested in. + // So we can simply reuse the directory without doing any additional git + // operations. 
+            ReuseOptions::Reuse {
+                dir: clone_dir.clone(),
+            }
+        } else if let Some(previous_commit_hash) = prev_commit_hash {
+            // we might be able to update a previous clone
+            let prev_clone_dir =
+                Self::clone_dir_path(&target_dir, &repo_url, &previous_commit_hash)?;
+
+            if self
+                .clones_in_use
+                .get(&prev_clone_dir)
+                .map(|ids| !ids.is_empty())
+                .unwrap_or(false)
+            {
+                // previous clone is still in use -> we cannot rename it, but we can copy it
+                ReuseOptions::CopyAndFetch {
+                    from: prev_clone_dir,
+                    target_dir: clone_dir.clone(),
+                    commit_hash,
+                }
+            } else if prev_clone_dir.exists() {
+                // there is a previous clone that is no longer in use -> rename it
+                ReuseOptions::RenameAndFetch {
+                    from: prev_clone_dir,
+                    target_dir: clone_dir.clone(),
+                    commit_hash,
+                }
+            } else {
+                // no existing clone associated with previous build id
+                ReuseOptions::NewClone {
+                    target_dir: clone_dir.clone(),
+                    repo_url,
+                    commit_hash,
+                }
+            }
+        } else {
+            // no previous build that we can reuse
+            ReuseOptions::NewClone {
+                target_dir: clone_dir.clone(),
+                repo_url,
+                commit_hash,
+            }
+        };
+        self.register_ready_clone_dir(session_id, clone_dir);
+
+        Ok(GitFolder { reuse })
+    }
+
+    pub fn in_use(&self, dir: &Path) -> bool {
+        self.clones_in_use
+            .get(dir)
+            .map(|ids| !ids.is_empty())
+            .unwrap_or(false)
+    }
+
+    pub fn clone_dir_ready(&self, session_id: SessionId, dir: &Path) -> bool {
+        self.prepared_builds
+            .get(&session_id)
+            .map(|p| p.planned_clone_dirs.contains(dir))
+            .unwrap_or(false)
+            || dir.exists()
+    }
+
+    pub fn register_ready_clone_dir(&mut self, session_id: SessionId, dir: PathBuf) -> bool {
+        self.prepared_builds
+            .entry(session_id)
+            .or_default()
+            .planned_clone_dirs
+            .insert(dir)
+    }
+
+    fn clone_dir_path(
+        base_dir: &Path,
+        repo_url: &Url,
+        commit_hash: &String,
+    ) -> eyre::Result<PathBuf> {
+        let mut path = base_dir.join(repo_url.host_str().context("git URL has no hostname")?);
+        path.extend(repo_url.path_segments().context("no path in git URL")?);
+        let path = path.join(commit_hash);
+        Ok(dunce::simplified(&path).to_owned())
+    }
+}
+
+pub struct GitFolder {
+    /// Specifies whether an existing repo should be reused.
+    reuse: ReuseOptions,
+}
+
+impl GitFolder {
+    pub async fn prepare(self, logger: &mut impl BuildLogger) -> eyre::Result<PathBuf> {
+        let GitFolder { reuse } = self;
+
+        eprintln!("reuse: {reuse:?}");
+        let clone_dir = match reuse {
+            ReuseOptions::NewClone {
+                target_dir,
+                repo_url,
+                commit_hash,
+            } => {
+                logger
+                    .log_message(
+                        LogLevel::Info,
+                        format!(
+                            "cloning {repo_url}#{commit_hash} into {}",
+                            target_dir.display()
+                        ),
+                    )
+                    .await;
+                let clone_target = target_dir.clone();
+                let checkout_result = tokio::task::spawn_blocking(move || {
+                    let repository = clone_into(repo_url.clone(), &clone_target)
+                        .with_context(|| format!("failed to clone git repo from `{repo_url}`"))?;
+                    checkout_tree(&repository, &commit_hash)
+                        .with_context(|| format!("failed to checkout commit `{commit_hash}`"))
+                })
+                .await
+                .unwrap();
+
+                match checkout_result {
+                    Ok(()) => target_dir,
+                    Err(err) => {
+                        logger
+                            .log_message(LogLevel::Error, format!("{err:?}"))
+                            .await;
+                        // remove erroneous clone again
+                        if let Err(err) = std::fs::remove_dir_all(target_dir) {
+                            logger
+                                .log_message(
+                                    LogLevel::Error,
+                                    format!(
+                                        "failed to remove clone dir after clone/checkout error: {}",
+                                        err.kind()
+                                    ),
+                                )
+                                .await;
+                        }
+                        bail!(err)
+                    }
+                }
+            }
+            ReuseOptions::CopyAndFetch {
+                from,
+                target_dir,
+                commit_hash,
+            } => {
+                tokio::fs::copy(&from, &target_dir)
+                    .await
+                    .context("failed to copy repo clone")?;
+
+                logger
+                    .log_message(
+                        LogLevel::Info,
+                        format!("fetching changes after copying {}", from.display()),
+                    )
+                    .await;
+
+                let repository = fetch_changes(&target_dir, None).await?;
+                checkout_tree(&repository, &commit_hash)?;
+                target_dir
+            }
+            ReuseOptions::RenameAndFetch {
+                from,
+                target_dir,
+                commit_hash,
+            } => {
+                tokio::fs::rename(&from, &target_dir)
+                    .await
+                    .context("failed to rename repo clone")?;
+
+                logger
+                    .log_message(
+                        LogLevel::Info,
+                        format!("fetching changes after renaming {}", from.display()),
+                    )
+                    .await;
+
+                let repository = fetch_changes(&target_dir, None).await?;
+                checkout_tree(&repository, &commit_hash)?;
+                target_dir
+            }
+            ReuseOptions::Reuse { dir } => {
+                logger
+                    .log_message(
+                        LogLevel::Info,
+                        format!("reusing up-to-date {}", dir.display()),
+                    )
+                    .await;
+                dir
+            }
+        };
+        Ok(clone_dir)
+    }
+}
+
+#[derive(Debug)]
+enum ReuseOptions {
+    /// Create a new clone of the repository.
+    NewClone {
+        target_dir: PathBuf,
+        repo_url: Url,
+        commit_hash: String,
+    },
+    /// Reuse an existing up-to-date clone of the repository.
+    Reuse { dir: PathBuf },
+    /// Copy an older clone of the repository and fetch changes, then reuse it.
+    CopyAndFetch {
+        from: PathBuf,
+        target_dir: PathBuf,
+        commit_hash: String,
+    },
+    /// Rename an older clone of the repository and fetch changes, then reuse it.
+    RenameAndFetch {
+        from: PathBuf,
+        target_dir: PathBuf,
+        commit_hash: String,
+    },
+}
+
+fn rev_str(rev: &Option<GitRepoRev>) -> String {
+    match rev {
+        Some(GitRepoRev::Branch(branch)) => format!(" (branch {branch})"),
+        Some(GitRepoRev::Tag(tag)) => format!(" (tag {tag})"),
+        Some(GitRepoRev::Rev(rev)) => format!(" (rev {rev})"),
+        None => String::new(),
+    }
+}
+
+fn clone_into(repo_addr: Url, clone_dir: &Path) -> eyre::Result<git2::Repository> {
+    if let Some(parent) = clone_dir.parent() {
+        std::fs::create_dir_all(parent)
+            .context("failed to create parent directory for git clone")?;
+    }
+
+    let clone_dir = clone_dir.to_owned();
+
+    let mut builder = git2::build::RepoBuilder::new();
+    let mut fetch_options = git2::FetchOptions::new();
+    fetch_options.download_tags(git2::AutotagOption::All);
+    builder.fetch_options(fetch_options);
+    builder
+        .clone(repo_addr.as_str(), &clone_dir)
+        .context("failed to clone repo")
+}
+
+async fn fetch_changes(
+    repo_dir: &Path,
+    refname: Option<String>,
+) -> Result<git2::Repository, eyre::Error> {
+    let repo_dir = repo_dir.to_owned();
+    let fetch_changes = tokio::task::spawn_blocking(move || {
+        let repository = git2::Repository::open(&repo_dir).context("failed to open git repo")?;
+
+        {
+            let mut remote = repository
+                .find_remote("origin")
+                .context("failed to find remote `origin` in repo")?;
+            remote
+                .connect(git2::Direction::Fetch)
+                .context("failed to connect to remote")?;
+            let default_branch = remote
+                .default_branch()
+                .context("failed to get default branch for remote")?;
+            let fetch = match &refname {
+                Some(refname) => refname,
+                None => default_branch
+                    .as_str()
+                    .context("failed to read default branch as string")?,
+            };
+            let mut fetch_options = FetchOptions::new();
+            fetch_options.download_tags(git2::AutotagOption::All);
+            remote
+                .fetch(&[&fetch], Some(&mut fetch_options), None)
+                .context("failed to fetch from git repo")?;
+        }
+        Result::<_, eyre::Error>::Ok(repository)
+    });
+    let repository = fetch_changes.await??;
+    Ok(repository)
+}
+
+fn checkout_tree(repository: &git2::Repository, commit_hash: &str) -> eyre::Result<()> {
+    let (object, reference) = repository
+        .revparse_ext(commit_hash)
+        .context("failed to parse ref")?;
+    repository
+        .checkout_tree(&object, None)
+        .context("failed to checkout ref")?;
+    match reference {
+        Some(reference) => repository
+            .set_head(reference.name().context("failed to get reference_name")?)
+ .context("failed to set head")?, + None => repository + .set_head_detached(object.id()) + .context("failed to set detached head")?, + } + + Ok(()) +} diff --git a/libraries/core/src/build/logger.rs b/libraries/core/src/build/logger.rs new file mode 100644 index 00000000..d683bcd4 --- /dev/null +++ b/libraries/core/src/build/logger.rs @@ -0,0 +1,15 @@ +use std::future::Future; + +use dora_message::common::LogLevel; + +pub trait BuildLogger: Send { + type Clone: BuildLogger + 'static; + + fn log_message( + &mut self, + level: LogLevel, + message: impl Into + Send, + ) -> impl Future + Send; + + fn try_clone(&self) -> impl Future> + Send; +} diff --git a/libraries/core/src/build/mod.rs b/libraries/core/src/build/mod.rs new file mode 100644 index 00000000..3995b222 --- /dev/null +++ b/libraries/core/src/build/mod.rs @@ -0,0 +1,139 @@ +pub use git::GitManager; +pub use logger::BuildLogger; + +use url::Url; + +use std::{collections::BTreeMap, future::Future, path::PathBuf}; + +use crate::descriptor::ResolvedNode; +use dora_message::{ + common::{GitSource, LogLevel}, + descriptor::{CoreNodeKind, EnvValue}, + id::NodeId, + SessionId, +}; +use eyre::Context; + +use build_command::run_build_command; +use git::GitFolder; + +mod build_command; +mod git; +mod logger; + +#[derive(Clone)] +pub struct Builder { + pub session_id: SessionId, + pub base_working_dir: PathBuf, + pub uv: bool, +} + +impl Builder { + pub async fn build_node( + self, + node: ResolvedNode, + git: Option, + prev_git: Option, + mut logger: impl BuildLogger, + git_manager: &mut GitManager, + ) -> eyre::Result>> { + let prepared_git = if let Some(GitSource { repo, commit_hash }) = git { + let repo_url = Url::parse(&repo).context("failed to parse git repository URL")?; + let target_dir = self.base_working_dir.join("git"); + let prev_hash = prev_git.filter(|p| p.repo == repo).map(|p| p.commit_hash); + let git_folder = git_manager.choose_clone_dir( + self.session_id, + repo_url, + commit_hash, + prev_hash, + &target_dir, + )?; + Some(git_folder) + } else { + None + }; + + let task = async move { self.build_node_inner(node, &mut logger, prepared_git).await }; + Ok(task) + } + + async fn build_node_inner( + self, + node: ResolvedNode, + logger: &mut impl BuildLogger, + git_folder: Option, + ) -> eyre::Result { + logger.log_message(LogLevel::Debug, "building node").await; + let node_working_dir = match &node.kind { + CoreNodeKind::Custom(n) => { + let node_working_dir = match git_folder { + Some(git_folder) => git_folder.prepare(logger).await?, + None => self.base_working_dir, + }; + + if let Some(build) = &n.build { + build_node(logger, &node.env, node_working_dir.clone(), build, self.uv).await?; + } + node_working_dir + } + CoreNodeKind::Runtime(n) => { + // run build commands + for operator in &n.operators { + if let Some(build) = &operator.config.build { + build_node( + logger, + &node.env, + self.base_working_dir.clone(), + build, + self.uv, + ) + .await?; + } + } + self.base_working_dir.clone() + } + }; + Ok(BuiltNode { node_working_dir }) + } +} + +async fn build_node( + logger: &mut impl BuildLogger, + node_env: &Option>, + working_dir: PathBuf, + build: &String, + uv: bool, +) -> eyre::Result<()> { + logger + .log_message(LogLevel::Info, format!("running build command: `{build}")) + .await; + let build = build.to_owned(); + let node_env = node_env.clone(); + let mut logger = logger.try_clone().await.context("failed to clone logger")?; + let (stdout_tx, mut stdout) = tokio::sync::mpsc::channel(10); + let task = 
+        run_build_command(&build, &working_dir, uv, &node_env, stdout_tx)
+            .context("build command failed")
+    });
+    tokio::spawn(async move {
+        while let Some(line) = stdout.recv().await {
+            logger
+                .log_message(
+                    LogLevel::Info,
+                    line.unwrap_or_else(|err| format!("io err: {}", err.kind())),
+                )
+                .await;
+        }
+    });
+    task.await??;
+    Ok(())
+}
+
+pub struct BuiltNode {
+    pub node_working_dir: PathBuf,
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct BuildInfo {
+    pub node_working_dirs: BTreeMap<NodeId, PathBuf>,
+}
diff --git a/libraries/core/src/git.rs b/libraries/core/src/git.rs
new file mode 100644
index 00000000..e69de29b
diff --git a/libraries/core/src/lib.rs b/libraries/core/src/lib.rs
index 90f2c564..f18a6059 100644
--- a/libraries/core/src/lib.rs
+++ b/libraries/core/src/lib.rs
@@ -9,6 +9,7 @@ pub use dora_message::{config, uhlc};
 
 pub mod build;
 pub mod descriptor;
+pub mod git;
 pub mod metadata;
 pub mod topics;
 
diff --git a/libraries/message/src/cli_to_coordinator.rs b/libraries/message/src/cli_to_coordinator.rs
index 456bb1bd..bf3d3a03 100644
--- a/libraries/message/src/cli_to_coordinator.rs
+++ b/libraries/message/src/cli_to_coordinator.rs
@@ -1,22 +1,48 @@
-use std::{path::PathBuf, time::Duration};
+use std::{collections::BTreeMap, path::PathBuf, time::Duration};
 
 use uuid::Uuid;
 
 use crate::{
+    common::GitSource,
     descriptor::Descriptor,
     id::{NodeId, OperatorId},
+    BuildId, SessionId,
 };
 
 #[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
 pub enum ControlRequest {
+    Build {
+        session_id: SessionId,
+        dataflow: Descriptor,
+        git_sources: BTreeMap<NodeId, GitSource>,
+        prev_git_sources: BTreeMap<NodeId, GitSource>,
+        /// Allows overwriting the base working dir when CLI and daemon are
+        /// running on the same machine.
+        ///
+        /// Must not be used for multi-machine dataflows.
+        ///
+        /// Note that nodes with git sources still use a subdirectory of
+        /// the base working dir.
+        local_working_dir: Option<PathBuf>,
+        uv: bool,
+    },
+    WaitForBuild {
+        build_id: BuildId,
+    },
     Start {
+        build_id: Option<BuildId>,
+        session_id: SessionId,
         dataflow: Descriptor,
         name: Option<String>,
-        // TODO: remove this once we figure out deploying of node/operator
-        // binaries from CLI to coordinator/daemon
-        local_working_dir: PathBuf,
+        /// Allows overwriting the base working dir when CLI and daemon are
+        /// running on the same machine.
+        ///
+        /// Must not be used for multi-machine dataflows.
+        ///
+        /// Note that nodes with git sources still use a subdirectory of
+        /// the base working dir.
+        local_working_dir: Option<PathBuf>,
         uv: bool,
-        build_only: bool,
     },
     WaitForSpawn {
         dataflow_id: Uuid,
@@ -50,4 +76,9 @@ pub enum ControlRequest {
         dataflow_id: Uuid,
         level: log::LevelFilter,
     },
+    BuildLogSubscribe {
+        build_id: BuildId,
+        level: log::LevelFilter,
+    },
+    CliAndDefaultDaemonOnSameMachine,
 }
diff --git a/libraries/message/src/common.rs b/libraries/message/src/common.rs
index 015b163e..83591811 100644
--- a/libraries/message/src/common.rs
+++ b/libraries/message/src/common.rs
@@ -5,14 +5,15 @@ use aligned_vec::{AVec, ConstAlign};
 use eyre::Context as _;
 use uuid::Uuid;
 
-use crate::{daemon_to_daemon::InterDaemonEvent, id::NodeId, DataflowId};
+use crate::{daemon_to_daemon::InterDaemonEvent, id::NodeId, BuildId, DataflowId};
 
 pub use log::Level as LogLevel;
 
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 #[must_use]
 pub struct LogMessage {
-    pub dataflow_id: DataflowId,
+    pub build_id: Option<BuildId>,
+    pub dataflow_id: Option<DataflowId>,
     pub node_id: Option<NodeId>,
     pub daemon_id: Option<DaemonId>,
     pub level: LogLevel,
@@ -239,3 +240,9 @@ impl std::fmt::Display for DaemonId {
         write!(f, "{}", self.uuid)
     }
 }
+
+#[derive(Debug, serde::Deserialize, serde::Serialize, Clone, PartialEq, Eq)]
+pub struct GitSource {
+    pub repo: String,
+    pub commit_hash: String,
+}
diff --git a/libraries/message/src/coordinator_to_cli.rs b/libraries/message/src/coordinator_to_cli.rs
index 87eb7ae7..02243468 100644
--- a/libraries/message/src/coordinator_to_cli.rs
+++ b/libraries/message/src/coordinator_to_cli.rs
@@ -1,23 +1,46 @@
-use std::collections::{BTreeMap, BTreeSet};
+use std::{
+    collections::{BTreeMap, BTreeSet},
+    net::IpAddr,
+};
 
 use uuid::Uuid;
 
 pub use crate::common::{LogLevel, LogMessage, NodeError, NodeErrorCause, NodeExitStatus};
-use crate::{common::DaemonId, id::NodeId};
+use crate::{common::DaemonId, id::NodeId, BuildId};
 
 #[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
 pub enum ControlRequestReply {
     Error(String),
     CoordinatorStopped,
-    DataflowStartTriggered { uuid: Uuid },
-    DataflowSpawned { uuid: Uuid },
-    DataflowReloaded { uuid: Uuid },
-    DataflowStopped { uuid: Uuid, result: DataflowResult },
+    DataflowBuildTriggered {
+        build_id: BuildId,
+    },
+    DataflowBuildFinished {
+        build_id: BuildId,
+        result: Result<(), String>,
+    },
+    DataflowStartTriggered {
+        uuid: Uuid,
+    },
+    DataflowSpawned {
+        uuid: Uuid,
+    },
+    DataflowReloaded {
+        uuid: Uuid,
+    },
+    DataflowStopped {
+        uuid: Uuid,
+        result: DataflowResult,
+    },
     DataflowList(DataflowList),
     DestroyOk,
     DaemonConnected(bool),
     ConnectedDaemons(BTreeSet<DaemonId>),
     Logs(Vec<LogMessage>),
+    CliAndDefaultDaemonIps {
+        default_daemon: Option<IpAddr>,
+        cli: Option<IpAddr>,
+    },
 }
 
 #[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
diff --git a/libraries/message/src/coordinator_to_daemon.rs b/libraries/message/src/coordinator_to_daemon.rs
index 8c68a6ca..69da8923 100644
--- a/libraries/message/src/coordinator_to_daemon.rs
+++ b/libraries/message/src/coordinator_to_daemon.rs
@@ -5,10 +5,10 @@ use std::{
 };
 
 use crate::{
-    common::DaemonId,
+    common::{DaemonId, GitSource},
     descriptor::{Descriptor, ResolvedNode},
     id::{NodeId, OperatorId},
-    DataflowId,
+    BuildId, DataflowId, SessionId,
 };
 
 pub use crate::common::Timestamped;
@@ -33,6 +33,7 @@ impl RegisterResult {
 
 #[derive(Debug, serde::Deserialize, serde::Serialize)]
 pub enum DaemonCoordinatorEvent {
+    Build(BuildDataflowNodes),
     Spawn(SpawnDataflowNodes),
     AllNodesReady {
         dataflow_id: DataflowId,
@@ -55,13 +56,40 @@ pub enum DaemonCoordinatorEvent {
     Heartbeat,
 }
 
+#[derive(Debug, serde::Deserialize, serde::Serialize)]
+pub struct BuildDataflowNodes {
+    pub build_id: BuildId,
+    pub session_id: SessionId,
+    /// Allows overwriting the base working dir when CLI and daemon are
+    /// running on the same machine.
+    ///
+    /// Must not be used for multi-machine dataflows.
+    ///
+    /// Note that nodes with git sources still use a subdirectory of
+    /// the base working dir.
+    pub local_working_dir: Option<PathBuf>,
+    pub git_sources: BTreeMap<NodeId, GitSource>,
+    pub prev_git_sources: BTreeMap<NodeId, GitSource>,
+    pub dataflow_descriptor: Descriptor,
+    pub nodes_on_machine: BTreeSet<NodeId>,
+    pub uv: bool,
+}
+
 #[derive(Debug, serde::Deserialize, serde::Serialize)]
 pub struct SpawnDataflowNodes {
+    pub build_id: Option<BuildId>,
+    pub session_id: SessionId,
     pub dataflow_id: DataflowId,
-    pub working_dir: PathBuf,
+    /// Allows overwriting the base working dir when CLI and daemon are
+    /// running on the same machine.
+    ///
+    /// Must not be used for multi-machine dataflows.
+    ///
+    /// Note that nodes with git sources still use a subdirectory of
+    /// the base working dir.
+    pub local_working_dir: Option<PathBuf>,
     pub nodes: BTreeMap<NodeId, ResolvedNode>,
     pub dataflow_descriptor: Descriptor,
     pub spawn_nodes: BTreeSet<NodeId>,
     pub uv: bool,
-    pub build_only: bool,
 }
diff --git a/libraries/message/src/daemon_to_coordinator.rs b/libraries/message/src/daemon_to_coordinator.rs
index 309697be..ccafb0a5 100644
--- a/libraries/message/src/daemon_to_coordinator.rs
+++ b/libraries/message/src/daemon_to_coordinator.rs
@@ -3,7 +3,9 @@ use std::collections::BTreeMap;
 pub use crate::common::{
     DataMessage, LogLevel, LogMessage, NodeError, NodeErrorCause, NodeExitStatus, Timestamped,
 };
-use crate::{common::DaemonId, current_crate_version, id::NodeId, versions_compatible, DataflowId};
+use crate::{
+    common::DaemonId, current_crate_version, id::NodeId, versions_compatible, BuildId, DataflowId,
+};
 
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub enum CoordinatorRequest {
@@ -46,6 +48,10 @@ impl DaemonRegisterRequest {
 
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub enum DaemonEvent {
+    BuildResult {
+        build_id: BuildId,
+        result: Result<(), String>,
+    },
     SpawnResult {
         dataflow_id: DataflowId,
         result: Result<(), String>,
@@ -77,6 +83,7 @@ impl DataflowDaemonResult {
 
 #[derive(Debug, serde::Deserialize, serde::Serialize)]
 pub enum DaemonCoordinatorReply {
+    TriggerBuildResult(Result<(), String>),
     TriggerSpawnResult(Result<(), String>),
     ReloadResult(Result<(), String>),
     StopResult(Result<(), String>),
diff --git a/libraries/message/src/descriptor.rs b/libraries/message/src/descriptor.rs
index 02f660d4..ca583c0b 100644
--- a/libraries/message/src/descriptor.rs
+++ b/libraries/message/src/descriptor.rs
@@ -253,6 +253,12 @@ pub enum NodeSource {
     },
 }
 
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
+pub enum ResolvedNodeSource {
+    Local,
+    GitCommit { repo: String, commit_hash: String },
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
 pub enum GitRepoRev {
     Branch(String),
diff --git a/libraries/message/src/lib.rs b/libraries/message/src/lib.rs
index 9d1870e0..365eab9f 100644
--- a/libraries/message/src/lib.rs
+++ b/libraries/message/src/lib.rs
@@ -24,9 +24,44 @@ pub mod coordinator_to_cli;
 
 pub use arrow_data;
 pub use arrow_schema;
+use uuid::Uuid;
 
 pub type DataflowId = uuid::Uuid;
 
+#[derive(
+    Debug, Clone, Copy, serde::Serialize, serde::Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash,
+)]
+pub struct SessionId(uuid::Uuid);
+
+impl SessionId {
+    pub fn generate() -> Self {
+        Self(Uuid::new_v4())
+    }
+}
+
+impl std::fmt::Display for SessionId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) ->
std::fmt::Result { + write!(f, "SessionId({})", self.0) + } +} + +#[derive( + Debug, Clone, Copy, serde::Serialize, serde::Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash, +)] +pub struct BuildId(uuid::Uuid); + +impl BuildId { + pub fn generate() -> Self { + Self(Uuid::new_v4()) + } +} + +impl std::fmt::Display for BuildId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "BuildId({})", self.0) + } +} + fn current_crate_version() -> semver::Version { let crate_version_raw = env!("CARGO_PKG_VERSION"); From b82b1059dc097fd81187427bc7491391372339b8 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 6 Jun 2025 18:27:29 +0200 Subject: [PATCH 054/101] Remove empty module --- libraries/core/src/git.rs | 0 libraries/core/src/lib.rs | 1 - 2 files changed, 1 deletion(-) delete mode 100644 libraries/core/src/git.rs diff --git a/libraries/core/src/git.rs b/libraries/core/src/git.rs deleted file mode 100644 index e69de29b..00000000 diff --git a/libraries/core/src/lib.rs b/libraries/core/src/lib.rs index f18a6059..90f2c564 100644 --- a/libraries/core/src/lib.rs +++ b/libraries/core/src/lib.rs @@ -9,7 +9,6 @@ pub use dora_message::{config, uhlc}; pub mod build; pub mod descriptor; -pub mod git; pub mod metadata; pub mod topics; From 15ee34e8f1d4e3ac4be281b2b78e05d29e488f7b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 6 Jun 2025 18:38:03 +0200 Subject: [PATCH 055/101] Fix: Use new `TracingBuilder` in `multiple_daemons` example --- examples/multiple-daemons/run.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/multiple-daemons/run.rs b/examples/multiple-daemons/run.rs index 4d8a40e1..ecb5794e 100644 --- a/examples/multiple-daemons/run.rs +++ b/examples/multiple-daemons/run.rs @@ -9,7 +9,7 @@ use dora_message::{ common::DaemonId, coordinator_to_cli::{ControlRequestReply, DataflowIdAndName}, }; -use dora_tracing::set_up_tracing_opts; +use dora_tracing::TracingBuilder; use eyre::{bail, Context}; use std::{ @@ -30,8 +30,9 @@ use uuid::Uuid; #[tokio::main] async fn main() -> eyre::Result<()> { - set_up_tracing_opts("multiple-daemon-runner", Some("debug"), None) - .wrap_err("failed to set up tracing subscriber")?; + TracingBuilder::new("multiple-daemon-runner") + .with_stdout("debug") + .build()?; let root = Path::new(env!("CARGO_MANIFEST_DIR")); std::env::set_current_dir(root.join(file!()).parent().unwrap()) From 3b930f68e02fcd74f0c9d348e4dbc902e834d852 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 6 Jun 2025 19:13:51 +0200 Subject: [PATCH 056/101] Link some additional libraries for C dataflow example --- examples/c-dataflow/run.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/c-dataflow/run.rs b/examples/c-dataflow/run.rs index ad484edf..d7615111 100644 --- a/examples/c-dataflow/run.rs +++ b/examples/c-dataflow/run.rs @@ -63,6 +63,7 @@ async fn build_c_node(root: &Path, name: &str, out_name: &str) -> eyre::Result<( clang.arg("-l").arg("m"); clang.arg("-l").arg("rt"); clang.arg("-l").arg("dl"); + clang.arg("-l").arg("z"); clang.arg("-pthread"); } #[cfg(target_os = "windows")] @@ -93,6 +94,7 @@ async fn build_c_node(root: &Path, name: &str, out_name: &str) -> eyre::Result<( clang.arg("-lsynchronization"); clang.arg("-luser32"); clang.arg("-lwinspool"); + clang.arg("-lwinhttp"); clang.arg("-Wl,-nodefaultlib:libcmt"); clang.arg("-D_DLL"); @@ -107,6 +109,7 @@ async fn build_c_node(root: &Path, name: &str, out_name: &str) -> eyre::Result<( clang.arg("-l").arg("pthread"); 
clang.arg("-l").arg("c"); clang.arg("-l").arg("m"); + clang.arg("-l").arg("z"); } clang.arg("-L").arg(root.join("target").join("debug")); clang @@ -161,6 +164,7 @@ async fn build_c_operator(root: &Path) -> eyre::Result<()> { link.arg("-lsynchronization"); link.arg("-luser32"); link.arg("-lwinspool"); + link.arg("-lwinhttp"); link.arg("-Wl,-nodefaultlib:libcmt"); link.arg("-D_DLL"); From 926dd51486b13d2cf114d0f4aa28a697eb4798f3 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 6 Jun 2025 19:49:07 +0200 Subject: [PATCH 057/101] Upgrade `setup-ros` action --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 74d33cf5..ed0a2682 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -192,7 +192,7 @@ jobs: # only save caches for `main` branch save-if: ${{ github.ref == 'refs/heads/main' }} - - uses: ros-tooling/setup-ros@v0.6 + - uses: ros-tooling/setup-ros@v0.7 with: required-ros-distributions: humble - run: 'source /opt/ros/humble/setup.bash && echo AMENT_PREFIX_PATH=${AMENT_PREFIX_PATH} >> "$GITHUB_ENV"' From 897074715b271a75c981af760e9489e316502e24 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 11:13:45 +0200 Subject: [PATCH 058/101] Link libz in more examples --- examples/c++-arrow-dataflow/run.rs | 1 + examples/c++-dataflow/run.rs | 1 + examples/c++-ros2-dataflow/run.rs | 1 + 3 files changed, 3 insertions(+) diff --git a/examples/c++-arrow-dataflow/run.rs b/examples/c++-arrow-dataflow/run.rs index 399a73b1..3fe206d1 100644 --- a/examples/c++-arrow-dataflow/run.rs +++ b/examples/c++-arrow-dataflow/run.rs @@ -136,6 +136,7 @@ async fn build_cxx_node( clang.arg("-l").arg("m"); clang.arg("-l").arg("rt"); clang.arg("-l").arg("dl"); + clang.arg("-l").arg("z"); clang.arg("-pthread"); } #[cfg(target_os = "windows")] diff --git a/examples/c++-dataflow/run.rs b/examples/c++-dataflow/run.rs index 6f966e19..9ee11168 100644 --- a/examples/c++-dataflow/run.rs +++ b/examples/c++-dataflow/run.rs @@ -157,6 +157,7 @@ async fn build_cxx_node( clang.arg("-l").arg("m"); clang.arg("-l").arg("rt"); clang.arg("-l").arg("dl"); + clang.arg("-l").arg("z"); clang.arg("-pthread"); } #[cfg(target_os = "windows")] diff --git a/examples/c++-ros2-dataflow/run.rs b/examples/c++-ros2-dataflow/run.rs index 918158c2..0be1f9f4 100644 --- a/examples/c++-ros2-dataflow/run.rs +++ b/examples/c++-ros2-dataflow/run.rs @@ -90,6 +90,7 @@ async fn build_cxx_node( clang.arg("-l").arg("m"); clang.arg("-l").arg("rt"); clang.arg("-l").arg("dl"); + clang.arg("-l").arg("z"); clang.arg("-pthread"); } #[cfg(target_os = "windows")] From a7af34d004dd2281a318fca6511f1f4f69cc7927 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 11:14:06 +0200 Subject: [PATCH 059/101] Don't include uuid when printing DaemonId --- libraries/message/src/common.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libraries/message/src/common.rs b/libraries/message/src/common.rs index 83591811..35eb54f8 100644 --- a/libraries/message/src/common.rs +++ b/libraries/message/src/common.rs @@ -235,9 +235,9 @@ impl DaemonId { impl std::fmt::Display for DaemonId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if let Some(id) = &self.machine_id { - write!(f, "{id}-")?; + write!(f, "{id}")?; } - write!(f, "{}", self.uuid) + Ok(()) } } From 1aadfaaa0ad9836e12d45dd7e9e6daf2a77aac57 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 11:16:36 +0200 
Subject: [PATCH 060/101] Link `rpcrt4` on Windows in C example --- examples/c-dataflow/run.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/c-dataflow/run.rs b/examples/c-dataflow/run.rs index d7615111..da88f64b 100644 --- a/examples/c-dataflow/run.rs +++ b/examples/c-dataflow/run.rs @@ -95,6 +95,7 @@ async fn build_c_node(root: &Path, name: &str, out_name: &str) -> eyre::Result<( clang.arg("-luser32"); clang.arg("-lwinspool"); clang.arg("-lwinhttp"); + clang.arg("-lrpcrt4"); clang.arg("-Wl,-nodefaultlib:libcmt"); clang.arg("-D_DLL"); @@ -165,6 +166,7 @@ async fn build_c_operator(root: &Path) -> eyre::Result<()> { link.arg("-luser32"); link.arg("-lwinspool"); link.arg("-lwinhttp"); + link.arg("-lrpcrt4"); link.arg("-Wl,-nodefaultlib:libcmt"); link.arg("-D_DLL"); From 829f483007177c87d2c1029b15b6b45c769b1269 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 11:43:37 +0200 Subject: [PATCH 061/101] Make `dora run` error on distributed dataflows --- binaries/cli/src/command/mod.rs | 2 +- binaries/coordinator/src/lib.rs | 31 +++++++++++++++-------- binaries/coordinator/src/run/mod.rs | 11 +++++--- binaries/daemon/src/lib.rs | 9 +++++++ libraries/core/src/descriptor/validate.rs | 12 +++++---- libraries/message/src/descriptor.rs | 12 ++++----- 6 files changed, 50 insertions(+), 27 deletions(-) diff --git a/binaries/cli/src/command/mod.rs b/binaries/cli/src/command/mod.rs index 77654440..1ef93c08 100644 --- a/binaries/cli/src/command/mod.rs +++ b/binaries/cli/src/command/mod.rs @@ -26,7 +26,7 @@ fn local_working_dir( if dataflow_descriptor .nodes .iter() - .all(|n| n.deploy.machine.is_none()) + .all(|n| n.deploy.as_ref().map(|d| d.machine.as_ref()).is_none()) && cli_and_daemon_on_same_machine(coordinator_session)? { Some( diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 7f5f7d3f..6936ea85 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -1197,7 +1197,7 @@ async fn retrieve_logs( let machine_ids: Vec> = nodes .values() .filter(|node| node.id == node_id) - .map(|node| node.deploy.machine.clone()) + .map(|node| node.deploy.as_ref().and_then(|d| d.machine.clone())) .collect(); let machine_id = if let [machine_id] = &machine_ids[..] 
{ @@ -1263,14 +1263,24 @@ async fn build_dataflow( let mut git_sources_by_daemon = git_sources .into_iter() - .into_grouping_map_by(|(id, _)| nodes.get(id).and_then(|n| n.deploy.machine.as_ref())) + .into_grouping_map_by(|(id, _)| { + nodes + .get(id) + .and_then(|n| n.deploy.as_ref().and_then(|d| d.machine.as_ref())) + }) .collect(); let mut prev_git_sources_by_daemon = prev_git_sources .into_iter() - .into_grouping_map_by(|(id, _)| nodes.get(id).and_then(|n| n.deploy.machine.as_ref())) + .into_grouping_map_by(|(id, _)| { + nodes + .get(id) + .and_then(|n| n.deploy.as_ref().and_then(|d| d.machine.as_ref())) + }) .collect(); - let nodes_by_daemon = nodes.values().into_group_map_by(|n| &n.deploy.machine); + let nodes_by_daemon = nodes + .values() + .into_group_map_by(|n| n.deploy.as_ref().and_then(|d| d.machine.as_ref())); let mut daemons = BTreeSet::new(); for (machine, nodes_on_machine) in &nodes_by_daemon { @@ -1283,11 +1293,9 @@ async fn build_dataflow( build_id, session_id, local_working_dir: local_working_dir.clone(), - git_sources: git_sources_by_daemon - .remove(&machine.as_ref()) - .unwrap_or_default(), + git_sources: git_sources_by_daemon.remove(machine).unwrap_or_default(), prev_git_sources: prev_git_sources_by_daemon - .remove(&machine.as_ref()) + .remove(machine) .unwrap_or_default(), dataflow_descriptor: dataflow.clone(), nodes_on_machine, @@ -1298,9 +1306,10 @@ async fn build_dataflow( timestamp: clock.new_timestamp(), })?; - let daemon_id = build_dataflow_on_machine(daemon_connections, machine.as_deref(), &message) - .await - .wrap_err_with(|| format!("failed to build dataflow on machine `{machine:?}`"))?; + let daemon_id = + build_dataflow_on_machine(daemon_connections, machine.map(|s| s.as_str()), &message) + .await + .wrap_err_with(|| format!("failed to build dataflow on machine `{machine:?}`"))?; daemons.insert(daemon_id); } diff --git a/binaries/coordinator/src/run/mod.rs b/binaries/coordinator/src/run/mod.rs index ca89fb87..9edcabd3 100644 --- a/binaries/coordinator/src/run/mod.rs +++ b/binaries/coordinator/src/run/mod.rs @@ -33,7 +33,9 @@ pub(super) async fn spawn_dataflow( let nodes = dataflow.resolve_aliases_and_set_defaults()?; let uuid = Uuid::new_v7(Timestamp::now(NoContext)); - let nodes_by_daemon = nodes.values().into_group_map_by(|n| &n.deploy.machine); + let nodes_by_daemon = nodes + .values() + .into_group_map_by(|n| n.deploy.as_ref().and_then(|d| d.machine.as_ref())); let mut daemons = BTreeSet::new(); for (machine, nodes_on_machine) in &nodes_by_daemon { @@ -57,9 +59,10 @@ pub(super) async fn spawn_dataflow( timestamp: clock.new_timestamp(), })?; - let daemon_id = spawn_dataflow_on_machine(daemon_connections, machine.as_deref(), &message) - .await - .wrap_err_with(|| format!("failed to spawn dataflow on machine `{machine:?}`"))?; + let daemon_id = + spawn_dataflow_on_machine(daemon_connections, machine.map(|m| m.as_str()), &message) + .await + .wrap_err_with(|| format!("failed to spawn dataflow on machine `{machine:?}`"))?; daemons.insert(daemon_id); } diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index cd18b54d..5c686be6 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -174,6 +174,15 @@ impl Daemon { .to_owned(); let descriptor = read_as_descriptor(dataflow_path).await?; + if let Some(node) = descriptor.nodes.iter().find(|n| n.deploy.is_some()) { + eyre::bail!( + "node {} has a `deploy` section, which is not supported in `dora run`\n\n + Instead, you need to spawn a `dora coordinator` and one or more `dora 
daemon`
+                instances and then use `dora start`.",
+                node.id
+            )
+        }
+
         descriptor.check(&working_dir)?;
 
         let nodes = descriptor.resolve_aliases_and_set_defaults()?;
diff --git a/libraries/core/src/descriptor/validate.rs b/libraries/core/src/descriptor/validate.rs
index d0c1d79a..f68979cd 100644
--- a/libraries/core/src/descriptor/validate.rs
+++ b/libraries/core/src/descriptor/validate.rs
@@ -36,11 +36,13 @@ pub fn check_dataflow(
                 if source_is_url(source) {
                     info!("{source} is a URL."); // TODO: Implement url check.
                 } else if let Some(remote_daemon_id) = remote_daemon_id {
-                    if let Some(machine) = &node.deploy.machine {
-                        if remote_daemon_id.contains(&machine.as_str())
-                            || coordinator_is_remote
-                        {
-                            info!("skipping path check for remote node `{}`", node.id);
+                    if let Some(deploy) = &node.deploy {
+                        if let Some(machine) = &deploy.machine {
+                            if remote_daemon_id.contains(&machine.as_str())
+                                || coordinator_is_remote
+                            {
+                                info!("skipping path check for remote node `{}`", node.id);
+                            }
                         }
                     }
                 } else if custom.build.is_some() {
diff --git a/libraries/message/src/descriptor.rs b/libraries/message/src/descriptor.rs
index ca583c0b..14835308 100644
--- a/libraries/message/src/descriptor.rs
+++ b/libraries/message/src/descriptor.rs
@@ -23,15 +23,15 @@ pub struct Descriptor {
     #[serde(default)]
     pub communication: CommunicationConfig,
     #[schemars(skip)]
-    #[serde(default, rename = "_unstable_deploy")]
-    pub deploy: Deploy,
+    #[serde(rename = "_unstable_deploy")]
+    pub deploy: Option<Deploy>,
     pub nodes: Vec<Node>,
     #[schemars(skip)]
     #[serde(default, rename = "_unstable_debug")]
     pub debug: Debug,
 }
 
-#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
 #[serde(deny_unknown_fields)]
 pub struct Deploy {
     pub machine: Option<String>,
@@ -58,8 +58,8 @@ pub struct Node {
 
     /// Unstable machine deployment configuration
     #[schemars(skip)]
-    #[serde(default, rename = "_unstable_deploy")]
-    pub deploy: Deploy,
+    #[serde(rename = "_unstable_deploy")]
+    pub deploy: Option<Deploy>,
 
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub operators: Option,
@@ -99,7 +99,7 @@ pub struct ResolvedNode {
     pub env: Option<BTreeMap<String, EnvValue>>,
 
     #[serde(default)]
-    pub deploy: Deploy,
+    pub deploy: Option<Deploy>,
 
     #[serde(flatten)]
     pub kind: CoreNodeKind,

From 9150928b31a3383aa3570de300778fc7c8ea807d Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Tue, 10 Jun 2025 11:57:15 +0200
Subject: [PATCH 062/101] Fix: Bring back `dora build` call in CI script

The `dora run` command no longer builds the dataflow.

---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ed0a2682..ee2e5884 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -409,6 +409,7 @@ jobs:
 
           # Run Rust queue latency test
           echo "Running CI Queue Size Latest Data Rust Test"
+          dora build tests/queue_size_latest_data_rust/dataflow.yaml --uv
           dora run tests/queue_size_latest_data_rust/dataflow.yaml --uv
 
       - name: "Test CLI (C)"

From 9d917bdaa9816106d7f266e4e86e52df73aee0a6 Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Tue, 10 Jun 2025 13:04:33 +0200
Subject: [PATCH 063/101] Parse dataflow descriptor lazily in node APIs

The dataflow descriptor format still changes often, which led to parse
errors. By doing the parsing lazily, this should only affect users of the
`dataflow_descriptor` function from now on.
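
As a rough illustration (this sketch is not part of the patch), node code
that used to receive a `&Descriptor` directly now gets a `Result` back and
has to handle a potential parse error itself. Assuming the usual
`DoraNode::init_from_env` setup, a caller looks roughly like:

    let (node, _events) = DoraNode::init_from_env()?;
    match node.dataflow_descriptor() {
        // parsing succeeded, the descriptor can be inspected as before
        Ok(descriptor) => println!("dataflow has {} nodes", descriptor.nodes.len()),
        // parsing failed, e.g. because daemon and node API versions mismatch
        Err(err) => eprintln!("descriptor not available: {err}"),
    }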
--- Cargo.lock | 10 +++++----- Cargo.toml | 1 + apis/python/node/Cargo.toml | 2 +- apis/python/operator/Cargo.toml | 2 +- apis/rust/node/Cargo.toml | 2 +- apis/rust/node/src/node/mod.rs | 15 +++++++++++---- binaries/cli/Cargo.toml | 2 +- binaries/daemon/Cargo.toml | 2 +- binaries/daemon/src/spawn.rs | 3 ++- binaries/runtime/Cargo.toml | 2 +- binaries/runtime/src/lib.rs | 3 ++- libraries/core/Cargo.toml | 2 +- libraries/message/Cargo.toml | 2 +- libraries/message/src/daemon_to_node.rs | 2 +- 14 files changed, 30 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fb45da50..a763d686 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3030,7 +3030,7 @@ dependencies = [ "git2", "itertools 0.14.0", "serde_json", - "serde_yaml 0.8.26", + "serde_yaml 0.9.34+deprecated", "shared-memory-server", "sysinfo 0.30.13", "tokio", @@ -3161,7 +3161,7 @@ dependencies = [ "futures-concurrency", "futures-timer", "serde_json", - "serde_yaml 0.8.26", + "serde_yaml 0.9.34+deprecated", "shared-memory-server", "shared_memory_extended", "tokio", @@ -3212,7 +3212,7 @@ dependencies = [ "futures", "pyo3", "pythonize", - "serde_yaml 0.8.26", + "serde_yaml 0.9.34+deprecated", "tokio", ] @@ -3293,7 +3293,7 @@ dependencies = [ "futures", "futures-concurrency", "pyo3", - "serde_yaml 0.8.26", + "serde_yaml 0.9.34+deprecated", ] [[package]] @@ -3419,7 +3419,7 @@ dependencies = [ "libloading 0.7.4", "pyo3", "pythonize", - "serde_yaml 0.8.26", + "serde_yaml 0.9.34+deprecated", "tokio", "tokio-stream", "tracing", diff --git a/Cargo.toml b/Cargo.toml index 541d3297..2b863fe5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -93,6 +93,7 @@ pyo3 = { version = "0.23", features = [ ] } pythonize = "0.23" git2 = { version = "0.18.0", features = ["vendored-openssl"] } +serde_yaml = "0.9.33" [package] name = "dora-examples" diff --git a/apis/python/node/Cargo.toml b/apis/python/node/Cargo.toml index c06fbbaa..063d8157 100644 --- a/apis/python/node/Cargo.toml +++ b/apis/python/node/Cargo.toml @@ -21,7 +21,7 @@ dora-node-api = { workspace = true } dora-operator-api-python = { workspace = true } pyo3.workspace = true eyre = "0.6" -serde_yaml = "0.8.23" +serde_yaml = { workspace = true } flume = "0.10.14" dora-runtime = { workspace = true, features = ["tracing", "metrics", "python"] } dora-cli = { workspace = true } diff --git a/apis/python/operator/Cargo.toml b/apis/python/operator/Cargo.toml index a96c5987..a65a929d 100644 --- a/apis/python/operator/Cargo.toml +++ b/apis/python/operator/Cargo.toml @@ -14,7 +14,7 @@ repository.workspace = true dora-node-api = { workspace = true } pyo3 = { workspace = true, features = ["eyre", "abi3-py37"] } eyre = "0.6" -serde_yaml = "0.8.23" +serde_yaml = { workspace = true } flume = "0.10.14" arrow = { workspace = true, features = ["pyarrow"] } arrow-schema = { workspace = true } diff --git a/apis/rust/node/Cargo.toml b/apis/rust/node/Cargo.toml index a96256f0..d1485b4b 100644 --- a/apis/rust/node/Cargo.toml +++ b/apis/rust/node/Cargo.toml @@ -17,7 +17,7 @@ dora-core = { workspace = true } dora-message = { workspace = true } shared-memory-server = { workspace = true } eyre = "0.6.7" -serde_yaml = "0.8.23" +serde_yaml = { workspace = true } tracing = "0.1.33" flume = "0.10.14" bincode = "1.3.3" diff --git a/apis/rust/node/src/node/mod.rs b/apis/rust/node/src/node/mod.rs index 47890d46..af58e536 100644 --- a/apis/rust/node/src/node/mod.rs +++ b/apis/rust/node/src/node/mod.rs @@ -60,7 +60,7 @@ pub struct DoraNode { drop_stream: DropStream, cache: VecDeque, - dataflow_descriptor: Descriptor, + 
dataflow_descriptor: serde_yaml::Result<Descriptor>,
     warned_unknown_output: BTreeSet<DataId>,
     _rt: TokioRuntime,
 }
@@ -200,7 +200,7 @@ impl DoraNode {
             sent_out_shared_memory: HashMap::new(),
             drop_stream,
             cache: VecDeque::new(),
-            dataflow_descriptor,
+            dataflow_descriptor: serde_yaml::from_value(dataflow_descriptor),
             warned_unknown_output: BTreeSet::new(),
             _rt: rt,
         };
@@ -449,8 +449,15 @@ impl DoraNode {
     /// Returns the full dataflow descriptor that this node is part of.
     ///
     /// This method returns the parsed dataflow YAML file.
-    pub fn dataflow_descriptor(&self) -> &Descriptor {
-        &self.dataflow_descriptor
+    pub fn dataflow_descriptor(&self) -> eyre::Result<&Descriptor> {
+        match &self.dataflow_descriptor {
+            Ok(d) => Ok(d),
+            Err(err) => eyre::bail!(
+                "failed to parse dataflow descriptor: {err}\n\n
+                This might be caused by mismatched version numbers of dora \
+                daemon and the dora node API"
+            ),
+        }
+    }
 }
diff --git a/binaries/cli/Cargo.toml b/binaries/cli/Cargo.toml
index 51e07f54..e806dc60 100644
--- a/binaries/cli/Cargo.toml
+++ b/binaries/cli/Cargo.toml
@@ -27,7 +27,7 @@ dora-node-api-c = { workspace = true }
 dora-operator-api-c = { workspace = true }
 dora-download = { workspace = true }
 serde = { version = "1.0.136", features = ["derive"] }
-serde_yaml = "0.9.11"
+serde_yaml = { workspace = true }
 webbrowser = "0.8.3"
 serde_json = "1.0.86"
 termcolor = "1.1.3"
diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml
index fdfd3596..04043b2f 100644
--- a/binaries/daemon/Cargo.toml
+++ b/binaries/daemon/Cargo.toml
@@ -31,7 +31,7 @@ dora-tracing = { workspace = true, optional = true }
 dora-arrow-convert = { workspace = true }
 dora-node-api = { workspace = true }
 dora-message = { workspace = true }
-serde_yaml = "0.8.23"
+serde_yaml = { workspace = true }
 uuid = { version = "1.7", features = ["v7"] }
 futures = "0.3.25"
 shared-memory-server = { workspace = true }
diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs
index cf6a3092..7d75b755 100644
--- a/binaries/daemon/src/spawn.rs
+++ b/binaries/daemon/src/spawn.rs
@@ -89,7 +89,8 @@ impl Spawner {
             node_id: node_id.clone(),
             run_config: node.kind.run_config(),
             daemon_communication,
-            dataflow_descriptor: self.dataflow_descriptor.clone(),
+            dataflow_descriptor: serde_yaml::to_value(&self.dataflow_descriptor)
+                .context("failed to serialize dataflow descriptor to YAML")?,
             dynamic: node.kind.dynamic(),
         };
diff --git a/binaries/runtime/Cargo.toml b/binaries/runtime/Cargo.toml
index 270ba264..73e6c615 100644
--- a/binaries/runtime/Cargo.toml
+++ b/binaries/runtime/Cargo.toml
@@ -21,7 +21,7 @@ eyre = "0.6.8"
 futures = "0.3.21"
 futures-concurrency = "7.1.0"
 libloading = "0.7.3"
-serde_yaml = "0.8.23"
+serde_yaml = { workspace = true }
 tokio = { version = "1.24.2", features = ["full"] }
 tokio-stream = "0.1.8"
 # pyo3-abi3 flag allow simpler linking.
See: https://pyo3.rs/v0.13.2/building_and_distribution.html diff --git a/binaries/runtime/src/lib.rs b/binaries/runtime/src/lib.rs index ea949bf4..6b3311bc 100644 --- a/binaries/runtime/src/lib.rs +++ b/binaries/runtime/src/lib.rs @@ -43,7 +43,8 @@ pub fn main() -> eyre::Result<()> { .wrap_err("failed to set up tracing subscriber")?; } - let dataflow_descriptor = config.dataflow_descriptor.clone(); + let dataflow_descriptor = serde_yaml::from_value(config.dataflow_descriptor.clone()) + .context("failed to parse dataflow descriptor")?; let operator_definition = if operators.is_empty() { bail!("no operators"); diff --git a/libraries/core/Cargo.toml b/libraries/core/Cargo.toml index af467ffe..d50765ef 100644 --- a/libraries/core/Cargo.toml +++ b/libraries/core/Cargo.toml @@ -13,7 +13,7 @@ repository.workspace = true dora-message = { workspace = true } eyre = "0.6.8" serde = { version = "1.0.136", features = ["derive"] } -serde_yaml = "0.9.11" +serde_yaml = { workspace = true } once_cell = "1.13.0" which = "5.0.0" uuid = { version = "1.7", features = ["serde", "v7"] } diff --git a/libraries/message/Cargo.toml b/libraries/message/Cargo.toml index 7bb3f673..7e4e179f 100644 --- a/libraries/message/Cargo.toml +++ b/libraries/message/Cargo.toml @@ -23,7 +23,7 @@ aligned-vec = { version = "0.5.0", features = ["serde"] } semver = { version = "1.0.23", features = ["serde"] } schemars = "0.8.19" uhlc = "0.5.1" -serde_yaml = "0.9.11" +serde_yaml = { workspace = true } once_cell = "1.13.0" serde-with-expand-env = "1.1.0" bincode = "1.3.3" diff --git a/libraries/message/src/daemon_to_node.rs b/libraries/message/src/daemon_to_node.rs index acc1630e..e0ce466c 100644 --- a/libraries/message/src/daemon_to_node.rs +++ b/libraries/message/src/daemon_to_node.rs @@ -23,7 +23,7 @@ pub struct NodeConfig { pub node_id: NodeId, pub run_config: NodeRunConfig, pub daemon_communication: DaemonCommunication, - pub dataflow_descriptor: Descriptor, + pub dataflow_descriptor: serde_yaml::Value, pub dynamic: bool, } From 01168ddb028df5b74e3e7fc04b8b01c682c17df6 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 13:08:37 +0200 Subject: [PATCH 064/101] Link libz in cmake template --- binaries/cli/src/template/c/cmake-template.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/binaries/cli/src/template/c/cmake-template.txt b/binaries/cli/src/template/c/cmake-template.txt index 32cb561f..eafe50da 100644 --- a/binaries/cli/src/template/c/cmake-template.txt +++ b/binaries/cli/src/template/c/cmake-template.txt @@ -64,16 +64,16 @@ link_directories(${dora_link_dirs}) add_executable(talker_1 talker_1/node.c) add_dependencies(talker_1 Dora_c) target_include_directories(talker_1 PRIVATE ${dora_c_include_dir}) -target_link_libraries(talker_1 dora_node_api_c m) +target_link_libraries(talker_1 dora_node_api_c m z) add_executable(talker_2 talker_2/node.c) add_dependencies(talker_2 Dora_c) target_include_directories(talker_2 PRIVATE ${dora_c_include_dir}) -target_link_libraries(talker_2 dora_node_api_c m) +target_link_libraries(talker_2 dora_node_api_c m z) add_executable(listener_1 listener_1/node.c) add_dependencies(listener_1 Dora_c) target_include_directories(listener_1 PRIVATE ${dora_c_include_dir}) -target_link_libraries(listener_1 dora_node_api_c m) +target_link_libraries(listener_1 dora_node_api_c m z) -install(TARGETS listener_1 talker_1 talker_2 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/bin) \ No newline at end of file +install(TARGETS listener_1 talker_1 talker_2 DESTINATION 
${CMAKE_CURRENT_SOURCE_DIR}/bin) From a19ab2d4c9b65e971a2e57fac0fc8fa0f19649e0 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 13:31:03 +0200 Subject: [PATCH 065/101] Fix: Handle error in Python `dataflow_descriptor` function --- apis/python/node/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apis/python/node/src/lib.rs b/apis/python/node/src/lib.rs index 2d3634cd..18e70c3e 100644 --- a/apis/python/node/src/lib.rs +++ b/apis/python/node/src/lib.rs @@ -230,7 +230,7 @@ impl Node { /// :rtype: dict pub fn dataflow_descriptor(&mut self, py: Python) -> eyre::Result { Ok( - pythonize::pythonize(py, &self.node.get_mut().dataflow_descriptor()) + pythonize::pythonize(py, &self.node.get_mut().dataflow_descriptor()?) .map(|x| x.unbind())?, ) } From 5b292bbcfb0ce8dd682b76f2d259fdad4f1620dc Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 13:33:58 +0200 Subject: [PATCH 066/101] Update commit hash in rust-git-dataflow example --- examples/rust-dataflow-git/dataflow.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/rust-dataflow-git/dataflow.yml b/examples/rust-dataflow-git/dataflow.yml index a64b2170..d47771aa 100644 --- a/examples/rust-dataflow-git/dataflow.yml +++ b/examples/rust-dataflow-git/dataflow.yml @@ -1,7 +1,7 @@ nodes: - id: rust-node git: https://github.com/dora-rs/dora.git - rev: 64a2dc9c # pinned commit, update this when changing the message crate + rev: a19ab2d4 # pinned commit, update this when changing the message crate build: cargo build -p rust-dataflow-example-node path: target/debug/rust-dataflow-example-node inputs: @@ -11,7 +11,7 @@ nodes: - id: rust-status-node git: https://github.com/dora-rs/dora.git - rev: 64a2dc9c # pinned commit, update this when changing the message crate + rev: a19ab2d4 # pinned commit, update this when changing the message crate build: cargo build -p rust-dataflow-example-status-node path: target/debug/rust-dataflow-example-status-node inputs: @@ -22,7 +22,7 @@ nodes: - id: rust-sink git: https://github.com/dora-rs/dora.git - rev: 64a2dc9c # pinned commit, update this when changing the message crate + rev: a19ab2d4 # pinned commit, update this when changing the message crate build: cargo build -p rust-dataflow-example-sink path: target/debug/rust-dataflow-example-sink inputs: From 314707bf95cbdb3c09c03daef117076140b035bc Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 13:49:36 +0200 Subject: [PATCH 067/101] Improve log output --- binaries/cli/src/output.rs | 22 ++++++++++------------ libraries/message/src/common.rs | 4 ++-- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/binaries/cli/src/output.rs b/binaries/cli/src/output.rs index ad35ad67..bdffc8d3 100644 --- a/binaries/cli/src/output.rs +++ b/binaries/cli/src/output.rs @@ -3,7 +3,7 @@ use dora_message::common::LogMessage; pub fn print_log_message(log_message: LogMessage) { let LogMessage { - build_id, + build_id: _, dataflow_id, node_id, daemon_id, @@ -21,28 +21,26 @@ pub fn print_log_message(log_message: LogMessage) { other => format!("{other:5}").normal(), }; let dataflow = if let Some(dataflow_id) = dataflow_id { - format!(" dataflow `{dataflow_id}`").cyan() - } else { - String::new().cyan() - }; - let build = if let Some(build_id) = build_id { - format!(" build `{build_id}`").cyan() + format!(" dataflow `{dataflow_id}`\t").cyan() } else { String::new().cyan() }; let daemon = match daemon_id { - Some(id) => format!(" on daemon `{id}`"), - None => " on 
default daemon".to_string(), + Some(id) => match id.machine_id() { + Some(machine_id) => format!(" on daemon `{machine_id}`\t"), + None => " on default daemon\t".to_string(), + }, + None => " on default daemon\t".to_string(), } .bright_black(); let node = match node_id { - Some(node_id) => format!(" {node_id}").bold(), + Some(node_id) => format!(" {node_id}\t").bold(), None => "".normal(), }; let target = match target { - Some(target) => format!(" {target}").dimmed(), + Some(target) => format!(" {target}\t").dimmed(), None => "".normal(), }; - println!("{level}{build}{dataflow}{daemon}{node}{target}: {message}"); + println!("{level}\t{dataflow}{daemon}{node}{target}: {message}"); } diff --git a/libraries/message/src/common.rs b/libraries/message/src/common.rs index 35eb54f8..83591811 100644 --- a/libraries/message/src/common.rs +++ b/libraries/message/src/common.rs @@ -235,9 +235,9 @@ impl DaemonId { impl std::fmt::Display for DaemonId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if let Some(id) = &self.machine_id { - write!(f, "{id}")?; + write!(f, "{id}-")?; } - Ok(()) + write!(f, "{}", self.uuid) } } From 18571fd7b601576d354bf0dd9735bcf17ea7baa7 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 13:51:54 +0200 Subject: [PATCH 068/101] Add CI script to manually delete buildjet cache There are sometimes issues where we need a full cleanup. --- .github/workflows/delete-buildjet-cache.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .github/workflows/delete-buildjet-cache.yml diff --git a/.github/workflows/delete-buildjet-cache.yml b/.github/workflows/delete-buildjet-cache.yml new file mode 100644 index 00000000..e54c212c --- /dev/null +++ b/.github/workflows/delete-buildjet-cache.yml @@ -0,0 +1,18 @@ +name: Manually Delete BuildJet Cache +on: + workflow_dispatch: + inputs: + cache_key: + description: 'BuildJet Cache Key to Delete' + required: true + type: string + +jobs: + manually-delete-buildjet-cache: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - uses: buildjet/cache-delete@v1 + with: + cache_key: ${{ inputs.cache_key }} From c3c22d160bc55f9842490e31fb0e8f81c4a906b7 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 14:02:44 +0200 Subject: [PATCH 069/101] Link libz in C++ cmake template too --- binaries/cli/src/template/cxx/cmake-template.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/binaries/cli/src/template/cxx/cmake-template.txt b/binaries/cli/src/template/cxx/cmake-template.txt index bd3fe492..7f7ce865 100644 --- a/binaries/cli/src/template/cxx/cmake-template.txt +++ b/binaries/cli/src/template/cxx/cmake-template.txt @@ -70,16 +70,16 @@ link_directories(${dora_link_dirs}) add_executable(talker_1 talker_1/node.cc ${node_bridge}) add_dependencies(talker_1 Dora_cxx) target_include_directories(talker_1 PRIVATE ${dora_cxx_include_dir}) -target_link_libraries(talker_1 dora_node_api_cxx) +target_link_libraries(talker_1 dora_node_api_cxx z) add_executable(talker_2 talker_2/node.cc ${node_bridge}) add_dependencies(talker_2 Dora_cxx) target_include_directories(talker_2 PRIVATE ${dora_cxx_include_dir}) -target_link_libraries(talker_2 dora_node_api_cxx) +target_link_libraries(talker_2 dora_node_api_cxx z) add_executable(listener_1 listener_1/node.cc ${node_bridge}) add_dependencies(listener_1 Dora_cxx) target_include_directories(listener_1 PRIVATE ${dora_cxx_include_dir}) -target_link_libraries(listener_1 dora_node_api_cxx) 
+target_link_libraries(listener_1 dora_node_api_cxx z) -install(TARGETS listener_1 talker_1 talker_2 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/bin) \ No newline at end of file +install(TARGETS listener_1 talker_1 talker_2 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/bin) From 14477b8645de91001e4c1307f160e4a0d1512c56 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 14:13:37 +0200 Subject: [PATCH 070/101] Add a `deploy.working_dir` key to dataflow YAML Allows overriding the working directory per node. --- binaries/daemon/src/lib.rs | 16 ++++++++++++++-- libraries/message/src/descriptor.rs | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 5c686be6..97bc56ea 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -1003,8 +1003,14 @@ impl Daemon { .await .wrap_err("failed to clone logger")?; + let mut builder = builder.clone(); + if let Some(node_working_dir) = + node.deploy.as_ref().and_then(|d| d.working_dir.as_deref()) + { + builder.base_working_dir = builder.base_working_dir.join(node_working_dir); + } + match builder - .clone() .build_node( node, git_source, @@ -1157,7 +1163,13 @@ impl Daemon { .await; let node_working_dir = node_working_dirs .get(&node_id) - .unwrap_or(&base_working_dir) + .cloned() + .or_else(|| { + node.deploy + .as_ref() + .and_then(|d| d.working_dir.as_ref().map(|d| base_working_dir.join(d))) + }) + .unwrap_or(base_working_dir.clone()) .clone(); match spawner .clone() diff --git a/libraries/message/src/descriptor.rs b/libraries/message/src/descriptor.rs index 14835308..f6a2ba9c 100644 --- a/libraries/message/src/descriptor.rs +++ b/libraries/message/src/descriptor.rs @@ -35,6 +35,7 @@ pub struct Descriptor { #[serde(deny_unknown_fields)] pub struct Deploy { pub machine: Option, + pub working_dir: Option, } #[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)] From 634f5cb5830ef4044714b6e4b5d0d0b2633b545b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 14:17:14 +0200 Subject: [PATCH 071/101] Bump dora-message version to 0.5.0-alpha --- Cargo.lock | 2 +- Cargo.toml | 2 +- libraries/message/Cargo.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a763d686..072c1323 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3103,7 +3103,7 @@ dependencies = [ [[package]] name = "dora-message" -version = "0.4.4" +version = "0.5.0-alpha" dependencies = [ "aligned-vec", "arrow-data", diff --git a/Cargo.toml b/Cargo.toml index 2b863fe5..5080b5a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -80,7 +80,7 @@ dora-ros2-bridge = { version = "0.3.11", path = "libraries/extensions/ros2-bridg dora-ros2-bridge-msg-gen = { version = "0.3.11", path = "libraries/extensions/ros2-bridge/msg-gen" } dora-ros2-bridge-python = { path = "libraries/extensions/ros2-bridge/python" } # versioned independently from the other dora crates -dora-message = { version = "0.4.4", path = "libraries/message" } +dora-message = { version = "0.5.0-alpha", path = "libraries/message" } arrow = { version = "54.2.1" } arrow-schema = { version = "54.2.1" } arrow-data = { version = "54.2.1" } diff --git a/libraries/message/Cargo.toml b/libraries/message/Cargo.toml index 7e4e179f..3bcc71fe 100644 --- a/libraries/message/Cargo.toml +++ b/libraries/message/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "dora-message" # versioned separately from the other dora crates -version = "0.4.4" +version = "0.5.0-alpha" edition.workspace = true 
documentation.workspace = true description.workspace = true From 2a053246925eaaa32bab2214719597a2d9ec6402 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 14:22:17 +0200 Subject: [PATCH 072/101] Fix some warnings --- apis/rust/node/src/node/mod.rs | 3 +-- libraries/core/src/build/git.rs | 13 ++----------- libraries/message/src/daemon_to_node.rs | 2 +- 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/apis/rust/node/src/node/mod.rs b/apis/rust/node/src/node/mod.rs index af58e536..7b4a109c 100644 --- a/apis/rust/node/src/node/mod.rs +++ b/apis/rust/node/src/node/mod.rs @@ -158,10 +158,9 @@ impl DoraNode { ), }; - let id = format!("{}/{}", dataflow_id, node_id); - #[cfg(feature = "metrics")] { + let id = format!("{}/{}", dataflow_id, node_id); let monitor_task = async move { if let Err(e) = run_metrics_monitor(id.clone()) .await diff --git a/libraries/core/src/build/git.rs b/libraries/core/src/build/git.rs index f53a5c9e..9e938ad4 100644 --- a/libraries/core/src/build/git.rs +++ b/libraries/core/src/build/git.rs @@ -35,7 +35,7 @@ impl GitManager { prev_commit_hash: Option, target_dir: &Path, ) -> eyre::Result { - let clone_dir = Self::clone_dir_path(&target_dir, &repo_url, &commit_hash)?; + let clone_dir = Self::clone_dir_path(target_dir, &repo_url, &commit_hash)?; if let Some(using) = self.clones_in_use.get(&clone_dir) { if !using.is_empty() { @@ -59,7 +59,7 @@ impl GitManager { } else if let Some(previous_commit_hash) = prev_commit_hash { // we might be able to update a previous clone let prev_clone_dir = - Self::clone_dir_path(&target_dir, &repo_url, &previous_commit_hash)?; + Self::clone_dir_path(target_dir, &repo_url, &previous_commit_hash)?; if self .clones_in_use @@ -271,15 +271,6 @@ enum ReuseOptions { }, } -fn rev_str(rev: &Option) -> String { - match rev { - Some(GitRepoRev::Branch(branch)) => format!(" (branch {branch})"), - Some(GitRepoRev::Tag(tag)) => format!(" (tag {tag})"), - Some(GitRepoRev::Rev(rev)) => format!(" (rev {rev})"), - None => String::new(), - } -} - fn clone_into(repo_addr: Url, clone_dir: &Path) -> eyre::Result { if let Some(parent) = clone_dir.parent() { std::fs::create_dir_all(parent) diff --git a/libraries/message/src/daemon_to_node.rs b/libraries/message/src/daemon_to_node.rs index e0ce466c..75c59bba 100644 --- a/libraries/message/src/daemon_to_node.rs +++ b/libraries/message/src/daemon_to_node.rs @@ -2,7 +2,7 @@ use std::{net::SocketAddr, path::PathBuf}; use crate::{ config::NodeRunConfig, - descriptor::{Descriptor, OperatorDefinition}, + descriptor::OperatorDefinition, id::{DataId, NodeId, OperatorId}, metadata::Metadata, DataflowId, From 0a60e3dbb645c86c5d8dbf6ad1c74c9dcbba5e22 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 14:25:38 +0200 Subject: [PATCH 073/101] Log a warning that git working dir is still unstable and might change --- libraries/core/src/build/mod.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/libraries/core/src/build/mod.rs b/libraries/core/src/build/mod.rs index 3995b222..f5afd3b3 100644 --- a/libraries/core/src/build/mod.rs +++ b/libraries/core/src/build/mod.rs @@ -67,7 +67,15 @@ impl Builder { let node_working_dir = match &node.kind { CoreNodeKind::Custom(n) => { let node_working_dir = match git_folder { - Some(git_folder) => git_folder.prepare(logger).await?, + Some(git_folder) => { + let clone_dir = git_folder.prepare(logger).await?; + tracing::warn!( + "using git clone directory as working dir: \ + this behavior is unstable and might change 
\ + (see https://github.com/dora-rs/dora/pull/901)" + ); + clone_dir + } None => self.base_working_dir, }; From a4382dedd1c3dac3d88a52784a8d2104af2a9df1 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 16:28:06 +0200 Subject: [PATCH 074/101] Fix: Check `exit_when_done` after spawn failures too --- binaries/daemon/src/lib.rs | 74 ++++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 23 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 97bc56ea..0e8f289f 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -461,10 +461,7 @@ impl Daemon { node_id, event, } => self.handle_node_event(event, dataflow, node_id).await?, - Event::Dora(event) => match self.handle_dora_event(event).await? { - RunStatus::Continue => {} - RunStatus::Exit => break, - }, + Event::Dora(event) => self.handle_dora_event(event).await?, Event::DynamicNode(event) => self.handle_dynamic_node_event(event).await?, Event::HeartbeatInterval => { if let Some(connection) = &mut self.coordinator_connection { @@ -582,6 +579,23 @@ impl Daemon { )?; } } + Event::NodeStopped { + dataflow_id, + node_id, + } => { + if let Some(exit_when_done) = &mut self.exit_when_done { + exit_when_done.remove(&(dataflow_id, node_id)); + if exit_when_done.is_empty() { + tracing::info!( + "exiting daemon because all required dataflows are finished" + ); + break; + } + } + if self.exit_when_all_finished && self.running.is_empty() { + break; + } + } } // warn if event handling took too long -> the main loop should never be blocked for too long @@ -1812,6 +1826,28 @@ impl Daemon { dataflow_id: Uuid, node_id: &NodeId, dynamic_node: bool, + ) -> eyre::Result<()> { + let result = self + .handle_node_stop_inner(dataflow_id, node_id, dynamic_node) + .await; + let _ = self + .events_tx + .send(Timestamped { + inner: Event::NodeStopped { + dataflow_id, + node_id: node_id.clone(), + }, + timestamp: self.clock.new_timestamp(), + }) + .await; + result + } + + async fn handle_node_stop_inner( + &mut self, + dataflow_id: Uuid, + node_id: &NodeId, + dynamic_node: bool, ) -> eyre::Result<()> { let mut logger = self.logger.for_dataflow(dataflow_id); let dataflow = match self.running.get_mut(&dataflow_id) { @@ -1900,7 +1936,7 @@ impl Daemon { Ok(()) } - async fn handle_dora_event(&mut self, event: DoraEvent) -> eyre::Result { + async fn handle_dora_event(&mut self, event: DoraEvent) -> eyre::Result<()> { match event { DoraEvent::Timer { dataflow_id, @@ -1909,11 +1945,11 @@ impl Daemon { } => { let Some(dataflow) = self.running.get_mut(&dataflow_id) else { tracing::warn!("Timer event for unknown dataflow `{dataflow_id}`"); - return Ok(RunStatus::Continue); + return Ok(()); }; let Some(subscribers) = dataflow.timers.get(&interval) else { - return Ok(RunStatus::Continue); + return Ok(()); }; let mut closed = Vec::new(); @@ -1950,7 +1986,7 @@ impl Daemon { } => { let Some(dataflow) = self.running.get_mut(&dataflow_id) else { tracing::warn!("Logs event for unknown dataflow `{dataflow_id}`"); - return Ok(RunStatus::Continue); + return Ok(()); }; let Some(subscribers) = dataflow.mappings.get(&output_id) else { @@ -1959,7 +1995,7 @@ impl Daemon { output_id, dataflow.mappings ); - return Ok(RunStatus::Continue); + return Ok(()); }; let mut closed = Vec::new(); @@ -2082,22 +2118,9 @@ impl Daemon { self.handle_node_stop(dataflow_id, &node_id, dynamic_node) .await?; - - if let Some(exit_when_done) = &mut self.exit_when_done { - exit_when_done.remove(&(dataflow_id, node_id)); - if 
exit_when_done.is_empty() { - tracing::info!( - "exiting daemon because all required dataflows are finished" - ); - return Ok(RunStatus::Exit); - } - } - if self.exit_when_all_finished && self.running.is_empty() { - return Ok(RunStatus::Exit); - } } } - Ok(RunStatus::Continue) + Ok(()) } fn base_working_dir( @@ -2610,6 +2633,10 @@ pub enum Event { dataflow_id: Uuid, result: eyre::Result<()>, }, + NodeStopped { + dataflow_id: Uuid, + node_id: NodeId, + }, } impl From for Event { @@ -2633,6 +2660,7 @@ impl Event { Event::SpawnNodeResult { .. } => "SpawnNodeResult", Event::BuildDataflowResult { .. } => "BuildDataflowResult", Event::SpawnDataflowResult { .. } => "SpawnDataflowResult", + Event::NodeStopped { .. } => "NodeStopped", } } } From 64ab0d7c42012c1c2e9d0f23766803130fd39319 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 16:29:18 +0200 Subject: [PATCH 075/101] Fix: Build `rust-ros2-dataflow` before running it --- examples/rust-ros2-dataflow/run.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/examples/rust-ros2-dataflow/run.rs b/examples/rust-ros2-dataflow/run.rs index b930a91b..a14dce48 100644 --- a/examples/rust-ros2-dataflow/run.rs +++ b/examples/rust-ros2-dataflow/run.rs @@ -11,12 +11,25 @@ async fn main() -> eyre::Result<()> { .wrap_err("failed to set working dir")?; let dataflow = Path::new("dataflow.yml"); + build_dataflow(dataflow).await?; run_dataflow(dataflow).await?; Ok(()) } +async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { + let cargo = std::env::var("CARGO").unwrap(); + let mut cmd = tokio::process::Command::new(&cargo); + cmd.arg("run"); + cmd.arg("--package").arg("dora-cli"); + cmd.arg("--").arg("build").arg(dataflow); + if !cmd.status().await?.success() { + bail!("failed to build dataflow"); + }; + Ok(()) +} + async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let cargo = std::env::var("CARGO").unwrap(); let mut cmd = tokio::process::Command::new(&cargo); From 215c3ac0910e3842548da532196c733cd5c10ef8 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 10 Jun 2025 16:58:39 +0200 Subject: [PATCH 076/101] Update commit hash in rust-git-dataflow example another time --- examples/rust-dataflow-git/dataflow.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/rust-dataflow-git/dataflow.yml b/examples/rust-dataflow-git/dataflow.yml index d47771aa..cf06ede2 100644 --- a/examples/rust-dataflow-git/dataflow.yml +++ b/examples/rust-dataflow-git/dataflow.yml @@ -1,7 +1,7 @@ nodes: - id: rust-node git: https://github.com/dora-rs/dora.git - rev: a19ab2d4 # pinned commit, update this when changing the message crate + rev: 64ab0d7c # pinned commit, update this when changing the message crate build: cargo build -p rust-dataflow-example-node path: target/debug/rust-dataflow-example-node inputs: @@ -11,7 +11,7 @@ nodes: - id: rust-status-node git: https://github.com/dora-rs/dora.git - rev: a19ab2d4 # pinned commit, update this when changing the message crate + rev: 64ab0d7c # pinned commit, update this when changing the message crate build: cargo build -p rust-dataflow-example-status-node path: target/debug/rust-dataflow-example-status-node inputs: @@ -22,7 +22,7 @@ nodes: - id: rust-sink git: https://github.com/dora-rs/dora.git - rev: a19ab2d4 # pinned commit, update this when changing the message crate + rev: 64ab0d7c # pinned commit, update this when changing the message crate build: cargo build -p rust-dataflow-example-sink path: target/debug/rust-dataflow-example-sink inputs: 
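The run.rs helper added in PATCH 075 shells out to the dora CLI through cargo. For reference, a minimal command-line equivalent from the example directory (a sketch only: the `build` invocation mirrors `build_dataflow` above, while the `start` subcommand is an assumption, since the tail of `run_dataflow` is cut off here):

    # build all nodes first; this clones and checks out any pinned git sources
    cargo run --package dora-cli -- build dataflow.yml
    # then launch the dataflow (assumed subcommand; see run.rs for the exact wiring)
    cargo run --package dora-cli -- start dataflow.yml

Building first is exactly what PATCH 075 fixes for the ros2 example; the same ordering applies to git-pinned nodes like the ones in the dataflow above.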
From c8c5374e3f53993c5cb13af680bcdf7b9cb3eafa Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 13 Jun 2025 12:07:32 +0200 Subject: [PATCH 077/101] Update `setup-python` version --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ee2e5884..b2005b2c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -212,11 +212,11 @@ jobs: source /opt/ros/humble/setup.bash && ros2 run turtlesim turtlesim_node & source /opt/ros/humble/setup.bash && ros2 run examples_rclcpp_minimal_service service_main & cargo run --example rust-ros2-dataflow --features="ros2-examples" - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 if: runner.os != 'Windows' with: python-version: "3.8" - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 if: runner.os == 'Windows' with: python-version: "3.10" @@ -324,7 +324,7 @@ jobs: dora stop --name ci-rust-dynamic --grace-duration 5s dora destroy - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 with: # TODO: Support Python 3.13 when https://github.com/pytorch/pytorch/issues/130249 is fixed python-version: "3.12" From 3c68d4c52af3a94e7131c30d8408b8d420b63361 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 13 Jun 2025 12:19:12 +0200 Subject: [PATCH 078/101] Debug: Comment out Python `run` function --- Cargo.lock | 1 - apis/python/node/Cargo.toml | 2 +- apis/python/node/src/lib.rs | 18 +++++++++--------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 072c1323..a3dad82c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3201,7 +3201,6 @@ name = "dora-node-api-python" version = "0.3.11" dependencies = [ "arrow", - "dora-cli", "dora-download", "dora-node-api", "dora-operator-api-python", diff --git a/apis/python/node/Cargo.toml b/apis/python/node/Cargo.toml index 063d8157..9aaab3ee 100644 --- a/apis/python/node/Cargo.toml +++ b/apis/python/node/Cargo.toml @@ -24,7 +24,7 @@ eyre = "0.6" serde_yaml = { workspace = true } flume = "0.10.14" dora-runtime = { workspace = true, features = ["tracing", "metrics", "python"] } -dora-cli = { workspace = true } +# dora-cli = { workspace = true } dora-download = { workspace = true } arrow = { workspace = true, features = ["pyarrow"] } pythonize = { workspace = true } diff --git a/apis/python/node/src/lib.rs b/apis/python/node/src/lib.rs index 18e70c3e..83f7c85a 100644 --- a/apis/python/node/src/lib.rs +++ b/apis/python/node/src/lib.rs @@ -375,21 +375,21 @@ pub fn resolve_dataflow(dataflow: String) -> eyre::Result { Ok(dataflow) } -/// Run a Dataflow -/// -/// :rtype: None -#[pyfunction] -#[pyo3(signature = (dataflow_path, uv=None))] -pub fn run(dataflow_path: String, uv: Option) -> eyre::Result<()> { - dora_cli::command::run(dataflow_path, uv.unwrap_or_default()) -} +// /// Run a Dataflow +// /// +// /// :rtype: None +// #[pyfunction] +// #[pyo3(signature = (dataflow_path, uv=None))] +// pub fn run(dataflow_path: String, uv: Option) -> eyre::Result<()> { +// dora_cli::command::run(dataflow_path, uv.unwrap_or_default()) +// } #[pymodule] fn dora(_py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { dora_ros2_bridge_python::create_dora_ros2_bridge_module(&m)?; m.add_function(wrap_pyfunction!(start_runtime, &m)?)?; - m.add_function(wrap_pyfunction!(run, &m)?)?; + // m.add_function(wrap_pyfunction!(run, &m)?)?; m.add_class::()?; m.setattr("__version__", env!("CARGO_PKG_VERSION"))?; m.setattr("__author__", "Dora-rs 
Authors")?; From 003542c0fa6da55d5fd4a5d76ad8d6a8c235f48c Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 13 Jun 2025 12:57:54 +0200 Subject: [PATCH 079/101] Add `build` feature to `dora-core` to make `git2` dependency optional We don't want to include it for e.g. the `dora-node-api-c`. --- binaries/daemon/Cargo.toml | 2 +- libraries/core/Cargo.toml | 7 +++++-- libraries/core/src/lib.rs | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/binaries/daemon/Cargo.toml b/binaries/daemon/Cargo.toml index 04043b2f..423dba86 100644 --- a/binaries/daemon/Cargo.toml +++ b/binaries/daemon/Cargo.toml @@ -24,7 +24,7 @@ tracing = "0.1.36" tracing-opentelemetry = { version = "0.18.0", optional = true } futures-concurrency = "7.1.0" serde_json = "1.0.86" -dora-core = { workspace = true } +dora-core = { workspace = true, features = ["build"] } flume = "0.10.14" dora-download = { workspace = true } dora-tracing = { workspace = true, optional = true } diff --git a/libraries/core/Cargo.toml b/libraries/core/Cargo.toml index d50765ef..11450c29 100644 --- a/libraries/core/Cargo.toml +++ b/libraries/core/Cargo.toml @@ -9,6 +9,9 @@ repository.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[features] +build = ["dep:git2", "dep:url"] + [dependencies] dora-message = { workspace = true } eyre = "0.6.8" @@ -24,6 +27,6 @@ schemars = "0.8.19" serde_json = "1.0.117" log = { version = "0.4.21", features = ["serde"] } dunce = "1.0.5" -url = "2.5.4" -git2 = { workspace = true } itertools = "0.14" +url = { version = "2.5.4", optional = true } +git2 = { workspace = true, optional = true } diff --git a/libraries/core/src/lib.rs b/libraries/core/src/lib.rs index 90f2c564..c45ec613 100644 --- a/libraries/core/src/lib.rs +++ b/libraries/core/src/lib.rs @@ -7,6 +7,7 @@ use std::{ pub use dora_message::{config, uhlc}; +#[cfg(feature = "build")] pub mod build; pub mod descriptor; pub mod metadata; From 735ff9d9a8c8b0eb7ed3cf2314fde137627ee14a Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 13 Jun 2025 13:20:41 +0200 Subject: [PATCH 080/101] Revert "Debug: Comment out Python `run` function" This reverts commit 3c68d4c52af3a94e7131c30d8408b8d420b63361. 
---
 Cargo.lock | 1 +
 apis/python/node/Cargo.toml | 2 +-
 apis/python/node/src/lib.rs | 18 +++++++++---------
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a3dad82c..072c1323 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3201,6 +3201,7 @@ name = "dora-node-api-python"
 version = "0.3.11"
 dependencies = [
  "arrow",
+ "dora-cli",
  "dora-download",
  "dora-node-api",
  "dora-operator-api-python",
diff --git a/apis/python/node/Cargo.toml b/apis/python/node/Cargo.toml
index 9aaab3ee..063d8157 100644
--- a/apis/python/node/Cargo.toml
+++ b/apis/python/node/Cargo.toml
@@ -24,7 +24,7 @@ eyre = "0.6"
 serde_yaml = { workspace = true }
 flume = "0.10.14"
 dora-runtime = { workspace = true, features = ["tracing", "metrics", "python"] }
-# dora-cli = { workspace = true }
+dora-cli = { workspace = true }
 dora-download = { workspace = true }
 arrow = { workspace = true, features = ["pyarrow"] }
 pythonize = { workspace = true }
diff --git a/apis/python/node/src/lib.rs b/apis/python/node/src/lib.rs
index 83f7c85a..18e70c3e 100644
--- a/apis/python/node/src/lib.rs
+++ b/apis/python/node/src/lib.rs
@@ -375,21 +375,21 @@ pub fn resolve_dataflow(dataflow: String) -> eyre::Result {
     Ok(dataflow)
 }

-// /// Run a Dataflow
-// ///
-// /// :rtype: None
-// #[pyfunction]
-// #[pyo3(signature = (dataflow_path, uv=None))]
-// pub fn run(dataflow_path: String, uv: Option) -> eyre::Result<()> {
-//     dora_cli::command::run(dataflow_path, uv.unwrap_or_default())
-// }
+/// Run a Dataflow
+///
+/// :rtype: None
+#[pyfunction]
+#[pyo3(signature = (dataflow_path, uv=None))]
+pub fn run(dataflow_path: String, uv: Option) -> eyre::Result<()> {
+    dora_cli::command::run(dataflow_path, uv.unwrap_or_default())
+}

 #[pymodule]
 fn dora(_py: Python, m: Bound<'_, PyModule>) -> PyResult<()> {
     dora_ros2_bridge_python::create_dora_ros2_bridge_module(&m)?;
     m.add_function(wrap_pyfunction!(start_runtime, &m)?)?;
-    // m.add_function(wrap_pyfunction!(run, &m)?)?;
+    m.add_function(wrap_pyfunction!(run, &m)?)?;
     m.add_class::()?;
     m.setattr("__version__", env!("CARGO_PKG_VERSION"))?;
     m.setattr("__author__", "Dora-rs Authors")?;

From de949521f6d76d4669c7f183d95730186893a4d0 Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Fri, 13 Jun 2025 14:16:27 +0200
Subject: [PATCH 081/101] Add build script with call to
 `add_extension_module_link_args()`

This is required on macOS when using the `extension-module` feature. See
https://pyo3.rs/v0.14.5/building_and_distribution.html#macos for details.
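Background on `add_extension_module_link_args()` (not stated in the patch itself; summarized from the pyo3 guide linked above): on macOS, a Python extension module must not link against libpython directly, because the interpreter's symbols are resolved only when the module is imported, so the build script asks the linker to allow undefined symbols when targeting `*-apple-darwin` and emits nothing on other platforms. Per the same guide, the manual alternative is a `.cargo/config.toml` along these lines:

    [target.x86_64-apple-darwin]
    rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"]

    [target.aarch64-apple-darwin]
    rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"]

The build-script route used here is preferable because it travels with the crate instead of requiring every consumer to carry that config.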
--- Cargo.lock | 1 + apis/python/node/Cargo.toml | 3 +++ apis/python/node/build.rs | 3 +++ 3 files changed, 7 insertions(+) create mode 100644 apis/python/node/build.rs diff --git a/Cargo.lock b/Cargo.lock index 072c1323..63e405bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3211,6 +3211,7 @@ dependencies = [ "flume 0.10.14", "futures", "pyo3", + "pyo3-build-config", "pythonize", "serde_yaml 0.9.34+deprecated", "tokio", diff --git a/apis/python/node/Cargo.toml b/apis/python/node/Cargo.toml index 063d8157..f03a4036 100644 --- a/apis/python/node/Cargo.toml +++ b/apis/python/node/Cargo.toml @@ -33,6 +33,9 @@ dora-ros2-bridge-python = { workspace = true } # pyo3_special_method_derive = "0.4.2" tokio = { version = "1.24.2", features = ["rt"] } +[build-dependencies] +pyo3-build-config = "0.23" + [lib] name = "dora" crate-type = ["cdylib"] diff --git a/apis/python/node/build.rs b/apis/python/node/build.rs new file mode 100644 index 00000000..dace4a9b --- /dev/null +++ b/apis/python/node/build.rs @@ -0,0 +1,3 @@ +fn main() { + pyo3_build_config::add_extension_module_link_args(); +} From 8a4ddf4f4d6f2b9e4b210d258e4f3df98fd0c3dd Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 13 Jun 2025 14:25:44 +0200 Subject: [PATCH 082/101] Use UUID v7 instead of v4 for build and session IDs V7 UUIDs include a timestamp and are sortable. This is useful to see which ID is newer. --- libraries/message/src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libraries/message/src/lib.rs b/libraries/message/src/lib.rs index 365eab9f..962176f5 100644 --- a/libraries/message/src/lib.rs +++ b/libraries/message/src/lib.rs @@ -24,7 +24,7 @@ pub mod coordinator_to_cli; pub use arrow_data; pub use arrow_schema; -use uuid::Uuid; +use uuid::{Timestamp, Uuid}; pub type DataflowId = uuid::Uuid; @@ -35,7 +35,7 @@ pub struct SessionId(uuid::Uuid); impl SessionId { pub fn generate() -> Self { - Self(Uuid::new_v4()) + Self(Uuid::new_v7(Timestamp::now(uuid::NoContext))) } } @@ -52,7 +52,7 @@ pub struct BuildId(uuid::Uuid); impl BuildId { pub fn generate() -> Self { - Self(Uuid::new_v4()) + Self(Uuid::new_v7(Timestamp::now(uuid::NoContext))) } } From 6b5cc7a0b94ea2bbfad626ba98a37dc6504dab12 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 13 Jun 2025 14:55:41 +0200 Subject: [PATCH 083/101] Set maturin macOS version to `14.5` --- apis/python/node/pyproject.toml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apis/python/node/pyproject.toml b/apis/python/node/pyproject.toml index 33048a3f..8636df49 100644 --- a/apis/python/node/pyproject.toml +++ b/apis/python/node/pyproject.toml @@ -22,3 +22,11 @@ extend-select = [ "D", # pydocstyle "UP", ] + +[tool.maturin.target.x86_64-apple-darwin] +# macOS deployment target SDK version +macos-deployment-target = "14.5" + +[tool.maturin.target.aarch64-apple-darwin] +# macOS deployment target SDK version +macos-deployment-target = "14.5" From 6071ba71b99a331db0eeb8328b5ec0050eb25674 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 13 Jun 2025 15:13:50 +0200 Subject: [PATCH 084/101] Add pyo3-build-config and macos-deployment-target also for binaries/cli --- Cargo.lock | 1 + binaries/cli/Cargo.toml | 3 +++ binaries/cli/build.rs | 1 + binaries/cli/pyproject.toml | 12 ++++++++++-- 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 63e405bd..3847931d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2948,6 +2948,7 @@ dependencies = [ "log", "notify 5.2.0", "pyo3", + "pyo3-build-config", 
"self-replace", "self_update", "serde", diff --git a/binaries/cli/Cargo.toml b/binaries/cli/Cargo.toml index e806dc60..349b13d3 100644 --- a/binaries/cli/Cargo.toml +++ b/binaries/cli/Cargo.toml @@ -64,6 +64,9 @@ self-replace = "1.5.0" dunce = "1.0.5" git2 = { workspace = true } +[build-dependencies] +pyo3-build-config = "0.23" + [lib] name = "dora_cli" path = "src/lib.rs" diff --git a/binaries/cli/build.rs b/binaries/cli/build.rs index 81caa36d..3672c16f 100644 --- a/binaries/cli/build.rs +++ b/binaries/cli/build.rs @@ -1,4 +1,5 @@ fn main() { + pyo3_build_config::add_extension_module_link_args(); println!( "cargo:rustc-env=TARGET={}", std::env::var("TARGET").unwrap() diff --git a/binaries/cli/pyproject.toml b/binaries/cli/pyproject.toml index 1ef4af39..c2d52457 100644 --- a/binaries/cli/pyproject.toml +++ b/binaries/cli/pyproject.toml @@ -15,6 +15,14 @@ features = ["python", "pyo3/extension-module"] [tool.ruff.lint] extend-select = [ - "D", # pydocstyle - "UP" + "D", # pydocstyle + "UP", ] + +[tool.maturin.target.x86_64-apple-darwin] +# macOS deployment target SDK version +macos-deployment-target = "14.5" + +[tool.maturin.target.aarch64-apple-darwin] +# macOS deployment target SDK version +macos-deployment-target = "14.5" From e21520d433f301843c411635e7c1f482bcd0927d Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 Jun 2025 13:24:09 +0200 Subject: [PATCH 085/101] Add `dora-session.yaml` to `.gitignore` to avoid accidental commits The file stores local state and should not be commited to git repositories. By adding the file to a `.gitignore` we can avoid this. Creates a new local `.gitignore` file if none exists. Otherwise, it appends the file to the existing `.gitignore` file. --- binaries/cli/src/session.rs | 24 ++++++++++++++++++++++-- examples/rust-dataflow-git/.gitignore | 2 ++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/binaries/cli/src/session.rs b/binaries/cli/src/session.rs index 29609e54..9311fded 100644 --- a/binaries/cli/src/session.rs +++ b/binaries/cli/src/session.rs @@ -44,8 +44,28 @@ impl DataflowSession { pub fn write_out_for_dataflow(&self, dataflow_path: &Path) -> eyre::Result<()> { let session_file = session_file_path(dataflow_path)?; - std::fs::write(session_file, self.serialize()?) + let filename = session_file + .file_name() + .context("session file has no file name")? + .to_str() + .context("session file name is no utf8")?; + std::fs::write(&session_file, self.serialize()?) 
.context("failed to write dataflow session file")?; + let gitignore = session_file.with_file_name(".gitignore"); + if gitignore.exists() { + let existing = + std::fs::read_to_string(&gitignore).context("failed to read gitignore")?; + if !existing + .lines() + .any(|l| l.split_once('/') == Some(("", filename))) + { + let new = existing + &format!("\n/{filename}\n"); + std::fs::write(gitignore, new).context("failed to update gitignore")?; + } + } else { + std::fs::write(gitignore, format!("/{filename}\n")) + .context("failed to write gitignore")?; + } Ok(()) } @@ -55,7 +75,7 @@ impl DataflowSession { } fn deserialize(session_file: &Path) -> eyre::Result { - std::fs::read_to_string(&session_file) + std::fs::read_to_string(session_file) .context("failed to read DataflowSession file") .and_then(|s| { serde_yaml::from_str(&s).context("failed to deserialize DataflowSession file") diff --git a/examples/rust-dataflow-git/.gitignore b/examples/rust-dataflow-git/.gitignore index dfdc87e3..1ac9aaff 100644 --- a/examples/rust-dataflow-git/.gitignore +++ b/examples/rust-dataflow-git/.gitignore @@ -1,2 +1,4 @@ /build /git + +/dataflow.dora-session.yaml From eb50a368f0e74034ca4c5ccef7abcb11622b6110 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 Jun 2025 13:55:32 +0200 Subject: [PATCH 086/101] Fix: Store build results to report them on subsequent `WaitForBuild` messages Quick failures might occur before the CLI even sent the `WaitForBuild` message --- binaries/coordinator/src/lib.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 6936ea85..90d0dfc3 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -204,6 +204,7 @@ async fn start_inner( let mut events = (abortable_events, daemon_events).merge(); let mut running_builds: HashMap = HashMap::new(); + let mut finished_builds: HashMap = HashMap::new(); let mut running_dataflows: HashMap = HashMap::new(); let mut dataflow_results: HashMap> = @@ -449,6 +450,8 @@ async fn start_inner( ControlRequest::WaitForBuild { build_id } => { if let Some(build) = running_builds.get_mut(&build_id) { build.build_result.register(reply_sender); + } else if let Some(result) = finished_builds.get_mut(&build_id) { + result.register(reply_sender); } else { let _ = reply_sender.send(Err(eyre!("unknown build id {build_id}"))); @@ -850,6 +853,8 @@ async fn start_inner( build.build_result.set_result(Ok( ControlRequestReply::DataflowBuildFinished { build_id, result }, )); + + finished_builds.insert(build_id, build.build_result); } } None => { From 4d15cc51bb44286c88b8cbbe89619a6f9a120910 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 Jun 2025 14:13:38 +0200 Subject: [PATCH 087/101] Improve log output for local builds --- binaries/cli/src/command/build/local.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/binaries/cli/src/command/build/local.rs b/binaries/cli/src/command/build/local.rs index ac28eeca..78d7152f 100644 --- a/binaries/cli/src/command/build/local.rs +++ b/binaries/cli/src/command/build/local.rs @@ -1,5 +1,6 @@ use std::{collections::BTreeMap, path::PathBuf}; +use colored::Colorize; use dora_core::{ build::{BuildInfo, BuildLogger, Builder, GitManager}, descriptor::{Descriptor, DescriptorExt}, @@ -89,8 +90,15 @@ impl BuildLogger for LocalBuildLogger { type Clone = Self; async fn log_message(&mut self, level: log::Level, message: impl Into + Send) { + let level = match level { + log::Level::Error => 
"ERROR".red(), + log::Level::Warn => "WARN ".yellow(), + log::Level::Info => "INFO ".green(), + other => format!("{other:5}").normal(), + }; + let node = self.node_id.to_string().bold().bright_black(); let message: String = message.into(); - println!("{}: \t{level}: \t{message}", self.node_id); + println!("{node}: \t{level}: \t{message}"); } async fn try_clone(&self) -> eyre::Result { From de32f92766ecd5f95a36abc928f2390ec0722934 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 Jun 2025 14:14:25 +0200 Subject: [PATCH 088/101] Force colored output for build commands See https://bixense.com/clicolors/ --- libraries/core/src/build/build_command.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libraries/core/src/build/build_command.rs b/libraries/core/src/build/build_command.rs index d04d35d3..ef31a6fd 100644 --- a/libraries/core/src/build/build_command.rs +++ b/libraries/core/src/build/build_command.rs @@ -45,6 +45,9 @@ pub fn run_build_command( cmd.stdout(Stdio::piped()); cmd.stderr(Stdio::piped()); + cmd.env("CLICOLOR", "1"); + cmd.env("CLICOLOR_FORCE", "1"); + let mut child = cmd .spawn() .wrap_err_with(|| format!("failed to spawn `{}`", build))?; From 950862b916d9f198dfca06e84f0b44ed5af6f58c Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 Jun 2025 19:37:06 +0200 Subject: [PATCH 089/101] Add a special log level for `stdout` output --- binaries/cli/src/command/build/local.rs | 24 ++-- binaries/cli/src/output.rs | 13 ++- binaries/coordinator/src/lib.rs | 2 +- binaries/coordinator/src/log_subscriber.rs | 10 +- binaries/daemon/src/log.rs | 128 +++++++++++++-------- binaries/daemon/src/spawn.rs | 4 +- libraries/core/src/build/logger.rs | 8 +- libraries/core/src/build/mod.rs | 7 +- libraries/message/src/common.rs | 14 ++- 9 files changed, 136 insertions(+), 74 deletions(-) diff --git a/binaries/cli/src/command/build/local.rs b/binaries/cli/src/command/build/local.rs index 78d7152f..32c7b319 100644 --- a/binaries/cli/src/command/build/local.rs +++ b/binaries/cli/src/command/build/local.rs @@ -2,7 +2,7 @@ use std::{collections::BTreeMap, path::PathBuf}; use colored::Colorize; use dora_core::{ - build::{BuildInfo, BuildLogger, Builder, GitManager}, + build::{BuildInfo, BuildLogger, Builder, GitManager, LogLevelOrStdout}, descriptor::{Descriptor, DescriptorExt}, }; use dora_message::{common::GitSource, id::NodeId}; @@ -89,16 +89,24 @@ struct LocalBuildLogger { impl BuildLogger for LocalBuildLogger { type Clone = Self; - async fn log_message(&mut self, level: log::Level, message: impl Into + Send) { - let level = match level { - log::Level::Error => "ERROR".red(), - log::Level::Warn => "WARN ".yellow(), - log::Level::Info => "INFO ".green(), - other => format!("{other:5}").normal(), + async fn log_message( + &mut self, + level: impl Into + Send, + message: impl Into + Send, + ) { + let level = match level.into() { + LogLevelOrStdout::LogLevel(level) => match level { + log::Level::Error => "ERROR ".red(), + log::Level::Warn => "WARN ".yellow(), + log::Level::Info => "INFO ".green(), + log::Level::Debug => "DEBUG ".bright_blue(), + log::Level::Trace => "TRACE ".dimmed(), + }, + LogLevelOrStdout::Stdout => "stdout".italic().dimmed(), }; let node = self.node_id.to_string().bold().bright_black(); let message: String = message.into(); - println!("{node}: \t{level}: \t{message}"); + println!("{node}: {level} {message}"); } async fn try_clone(&self) -> eyre::Result { diff --git a/binaries/cli/src/output.rs b/binaries/cli/src/output.rs index bdffc8d3..76db8c17 100644 --- 
a/binaries/cli/src/output.rs +++ b/binaries/cli/src/output.rs @@ -1,4 +1,5 @@ use colored::Colorize; +use dora_core::build::LogLevelOrStdout; use dora_message::common::LogMessage; pub fn print_log_message(log_message: LogMessage) { @@ -15,10 +16,14 @@ pub fn print_log_message(log_message: LogMessage) { message, } = log_message; let level = match level { - log::Level::Error => "ERROR".red(), - log::Level::Warn => "WARN ".yellow(), - log::Level::Info => "INFO ".green(), - other => format!("{other:5}").normal(), + LogLevelOrStdout::LogLevel(level) => match level { + log::Level::Error => "ERROR ".red(), + log::Level::Warn => "WARN ".yellow(), + log::Level::Info => "INFO ".green(), + log::Level::Debug => "DEBUG ".bright_blue(), + log::Level::Trace => "TRACE ".dimmed(), + }, + LogLevelOrStdout::Stdout => "stdout".bright_blue().italic().dimmed(), }; let dataflow = if let Some(dataflow_id) = dataflow_id { format!(" dataflow `{dataflow_id}`\t").cyan() diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 90d0dfc3..8e3a4c23 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -369,7 +369,7 @@ async fn start_inner( dataflow_id: Some(dataflow_id), node_id: None, daemon_id: None, - level: LogLevel::Info, + level: LogLevel::Info.into(), target: Some("coordinator".into()), module_path: None, file: None, diff --git a/binaries/coordinator/src/log_subscriber.rs b/binaries/coordinator/src/log_subscriber.rs index cb602d47..e5006616 100644 --- a/binaries/coordinator/src/log_subscriber.rs +++ b/binaries/coordinator/src/log_subscriber.rs @@ -17,9 +17,15 @@ impl LogSubscriber { } pub async fn send_message(&mut self, message: &LogMessage) -> eyre::Result<()> { - if message.level > self.level { - return Ok(()); + match message.level { + dora_core::build::LogLevelOrStdout::LogLevel(level) => { + if level > self.level { + return Ok(()); + } + } + dora_core::build::LogLevelOrStdout::Stdout => {} } + let message = serde_json::to_vec(&message)?; let connection = self.connection.as_mut().context("connection is closed")?; tcp_send(connection, &message) diff --git a/binaries/daemon/src/log.rs b/binaries/daemon/src/log.rs index c5fe171a..7092d328 100644 --- a/binaries/daemon/src/log.rs +++ b/binaries/daemon/src/log.rs @@ -4,7 +4,11 @@ use std::{ sync::Arc, }; -use dora_core::{build::BuildLogger, config::NodeId, uhlc}; +use dora_core::{ + build::{BuildLogger, LogLevelOrStdout}, + config::NodeId, + uhlc, +}; use dora_message::{ common::{DaemonId, LogLevel, LogMessage, Timestamped}, daemon_to_coordinator::{CoordinatorRequest, DaemonEvent}, @@ -101,9 +105,19 @@ pub struct NodeBuildLogger<'a> { } impl NodeBuildLogger<'_> { - pub async fn log(&mut self, level: LogLevel, message: impl Into) { + pub async fn log( + &mut self, + level: impl Into + Send, + message: impl Into, + ) { self.logger - .log_build(self.build_id, level, Some(self.node_id.clone()), message) + .log_build( + self.build_id, + level.into(), + None, + Some(self.node_id.clone()), + message, + ) .await } @@ -121,7 +135,7 @@ impl BuildLogger for NodeBuildLogger<'_> { fn log_message( &mut self, - level: LogLevel, + level: impl Into + Send, message: impl Into + Send, ) -> impl std::future::Future + Send { self.log(level, message) @@ -170,7 +184,7 @@ impl DaemonLogger { daemon_id: Some(self.daemon_id.clone()), dataflow_id, node_id, - level, + level: level.into(), target, module_path: None, file: None, @@ -183,7 +197,8 @@ impl DaemonLogger { pub async fn log_build( &mut self, build_id: BuildId, - level: 
LogLevel, + level: LogLevelOrStdout, + target: Option, node_id: Option, message: impl Into, ) { @@ -193,7 +208,7 @@ impl DaemonLogger { dataflow_id: None, node_id, level, - target: Some("build".into()), + target, module_path: None, file: None, line: None, @@ -249,33 +264,7 @@ impl Logger { // log message using tracing if reporting to coordinator is not possible match message.level { - LogLevel::Error => { - tracing::error!( - build_id = ?message.build_id.map(|id| id.to_string()), - dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), - node_id = ?message.node_id.map(|id| id.to_string()), - target = message.target, - module_path = message.module_path, - file = message.file, - line = message.line, - "{}", - Indent(&message.message) - ); - } - LogLevel::Warn => { - tracing::warn!( - build_id = ?message.build_id.map(|id| id.to_string()), - dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), - node_id = ?message.node_id.map(|id| id.to_string()), - target = message.target, - module_path = message.module_path, - file = message.file, - line = message.line, - "{}", - Indent(&message.message) - ); - } - LogLevel::Info => { + LogLevelOrStdout::Stdout => { tracing::info!( build_id = ?message.build_id.map(|id| id.to_string()), dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), @@ -286,22 +275,63 @@ impl Logger { line = message.line, "{}", Indent(&message.message) - ); - } - LogLevel::Debug => { - tracing::debug!( - build_id = ?message.build_id.map(|id| id.to_string()), - dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), - node_id = ?message.node_id.map(|id| id.to_string()), - target = message.target, - module_path = message.module_path, - file = message.file, - line = message.line, - "{}", - Indent(&message.message) - ); + ) } - _ => {} + LogLevelOrStdout::LogLevel(level) => match level { + LogLevel::Error => { + tracing::error!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + LogLevel::Warn => { + tracing::warn!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + LogLevel::Info => { + tracing::info!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + LogLevel::Debug => { + tracing::debug!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + _ => {} + }, } } diff --git a/binaries/daemon/src/spawn.rs b/binaries/daemon/src/spawn.rs index 7d75b755..2c24f5e4 100644 --- a/binaries/daemon/src/spawn.rs +++ b/binaries/daemon/src/spawn.rs @@ -590,9 +590,9 @@ impl 
PreparedNode {
         daemon_id: Some(daemon_id.clone()),
         dataflow_id: Some(dataflow_id),
         build_id: None,
-        level: LogLevel::Info,
+        level: dora_core::build::LogLevelOrStdout::Stdout,
         node_id: Some(node_id.clone()),
-        target: Some("stdout".into()),
+        target: None,
         message: formatted,
         file: None,
         line: None,
diff --git a/libraries/core/src/build/logger.rs b/libraries/core/src/build/logger.rs
index d683bcd4..c382b1ac 100644
--- a/libraries/core/src/build/logger.rs
+++ b/libraries/core/src/build/logger.rs
@@ -1,15 +1,19 @@
 use std::future::Future;

-use dora_message::common::LogLevel;
+pub use dora_message::common::LogLevelOrStdout;

 pub trait BuildLogger: Send {
     type Clone: BuildLogger + 'static;

     fn log_message(
         &mut self,
-        level: LogLevel,
+        level: impl Into + Send,
         message: impl Into + Send,
     ) -> impl Future + Send;

+    fn log_stdout(&mut self, message: impl Into + Send) -> impl Future + Send {
+        self.log_message(LogLevelOrStdout::Stdout, message)
+    }
+
     fn try_clone(&self) -> impl Future> + Send;
 }
diff --git a/libraries/core/src/build/mod.rs b/libraries/core/src/build/mod.rs
index f5afd3b3..b0449d35 100644
--- a/libraries/core/src/build/mod.rs
+++ b/libraries/core/src/build/mod.rs
@@ -1,5 +1,5 @@
 pub use git::GitManager;
-pub use logger::BuildLogger;
+pub use logger::{BuildLogger, LogLevelOrStdout};

 use url::Url;
@@ -126,10 +126,7 @@ async fn build_node(
     tokio::spawn(async move {
         while let Some(line) = stdout.recv().await {
             logger
-                .log_message(
-                    LogLevel::Info,
-                    line.unwrap_or_else(|err| format!("io err: {}", err.kind())),
-                )
+                .log_stdout(line.unwrap_or_else(|err| format!("io err: {}", err.kind())))
                 .await;
         }
     });
diff --git a/libraries/message/src/common.rs b/libraries/message/src/common.rs
index 83591811..d48f1308 100644
--- a/libraries/message/src/common.rs
+++ b/libraries/message/src/common.rs
@@ -16,7 +16,7 @@ pub struct LogMessage {
     pub dataflow_id: Option,
     pub node_id: Option,
     pub daemon_id: Option,
-    pub level: LogLevel,
+    pub level: LogLevelOrStdout,
     pub target: Option,
     pub module_path: Option,
     pub file: Option,
@@ -24,6 +24,18 @@ pub struct LogMessage {
     pub message: String,
 }

+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LogLevelOrStdout {
+    LogLevel(LogLevel),
+    Stdout,
+}
+
+impl From for LogLevelOrStdout {
+    fn from(level: LogLevel) -> Self {
+        Self::LogLevel(level)
+    }
+}
+
 #[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
 pub struct NodeError {
     pub timestamp: uhlc::Timestamp,

From 31d7c975fcd2616d304e92366240712915649a76 Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Wed, 18 Jun 2025 19:37:54 +0200
Subject: [PATCH 090/101] Only print daemon and dataflow IDs if they're useful

---
 binaries/cli/src/command/build/distributed.rs | 2 +-
 binaries/cli/src/command/start/attach.rs | 4 +-
 binaries/cli/src/command/start/mod.rs | 5 ++-
 binaries/cli/src/output.rs | 37 ++++++++++++-------
 4 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/binaries/cli/src/command/build/distributed.rs b/binaries/cli/src/command/build/distributed.rs
index 9e7fca67..1fd1ed91 100644
--- a/binaries/cli/src/command/build/distributed.rs
+++ b/binaries/cli/src/command/build/distributed.rs
@@ -78,7 +78,7 @@ pub fn wait_until_dataflow_built(
                     serde_json::from_slice(&raw).context("failed to parse log message");
                 match parsed {
                     Ok(log_message) => {
-                        print_log_message(log_message);
+                        print_log_message(log_message, false, true);
                     }
                     Err(err) => {
                         tracing::warn!("failed to parse log message: {err:?}")

diff --git
a/binaries/cli/src/command/start/attach.rs b/binaries/cli/src/command/start/attach.rs index 05d776e0..a32994d0 100644 --- a/binaries/cli/src/command/start/attach.rs +++ b/binaries/cli/src/command/start/attach.rs @@ -33,6 +33,8 @@ pub fn attach_dataflow( let nodes = dataflow.resolve_aliases_and_set_defaults()?; + let print_daemon_name = nodes.values().any(|n| n.deploy.is_some()); + let working_dir = dataflow_path .canonicalize() .context("failed to canonicalize dataflow path")? @@ -155,7 +157,7 @@ pub fn attach_dataflow( }, Ok(AttachEvent::Control(control_request)) => control_request, Ok(AttachEvent::Log(Ok(log_message))) => { - print_log_message(log_message); + print_log_message(log_message, false, print_daemon_name); continue; } Ok(AttachEvent::Log(Err(err))) => { diff --git a/binaries/cli/src/command/start/mod.rs b/binaries/cli/src/command/start/mod.rs index 5275a62d..72464f86 100644 --- a/binaries/cli/src/command/start/mod.rs +++ b/binaries/cli/src/command/start/mod.rs @@ -56,12 +56,14 @@ pub fn start( log_level, ) } else { + let print_daemon_name = dataflow_descriptor.nodes.iter().any(|n| n.deploy.is_some()); // wait until dataflow is started wait_until_dataflow_started( dataflow_id, &mut session, coordinator_socket, log::LevelFilter::Info, + print_daemon_name, ) } } @@ -120,6 +122,7 @@ fn wait_until_dataflow_started( session: &mut Box, coordinator_addr: SocketAddr, log_level: log::LevelFilter, + print_daemon_id: bool, ) -> eyre::Result<()> { // subscribe to log messages let mut log_session = TcpConnection { @@ -141,7 +144,7 @@ fn wait_until_dataflow_started( serde_json::from_slice(&raw).context("failed to parse log message"); match parsed { Ok(log_message) => { - print_log_message(log_message); + print_log_message(log_message, false, print_daemon_id); } Err(err) => { tracing::warn!("failed to parse log message: {err:?}") diff --git a/binaries/cli/src/output.rs b/binaries/cli/src/output.rs index 76db8c17..ff5ba755 100644 --- a/binaries/cli/src/output.rs +++ b/binaries/cli/src/output.rs @@ -2,7 +2,11 @@ use colored::Colorize; use dora_core::build::LogLevelOrStdout; use dora_message::common::LogMessage; -pub fn print_log_message(log_message: LogMessage) { +pub fn print_log_message( + log_message: LogMessage, + print_dataflow_id: bool, + print_daemon_name: bool, +) { let LogMessage { build_id: _, dataflow_id, @@ -25,27 +29,34 @@ pub fn print_log_message(log_message: LogMessage) { }, LogLevelOrStdout::Stdout => "stdout".bright_blue().italic().dimmed(), }; - let dataflow = if let Some(dataflow_id) = dataflow_id { - format!(" dataflow `{dataflow_id}`\t").cyan() - } else { - String::new().cyan() + + let dataflow = match dataflow_id { + Some(dataflow_id) if print_dataflow_id => format!("dataflow `{dataflow_id}` ").cyan(), + _ => String::new().cyan(), }; let daemon = match daemon_id { - Some(id) => match id.machine_id() { - Some(machine_id) => format!(" on daemon `{machine_id}`\t"), - None => " on default daemon\t".to_string(), + Some(id) if print_daemon_name => match id.machine_id() { + Some(machine_id) => format!("on daemon `{machine_id}`"), + None => "on default daemon ".to_string(), }, - None => " on default daemon\t".to_string(), + None if print_daemon_name => "on default daemon".to_string(), + _ => String::new(), } .bright_black(); + let colon = ":".bright_black().bold(); let node = match node_id { - Some(node_id) => format!(" {node_id}\t").bold(), - None => "".normal(), + Some(node_id) => { + let node_id = node_id.to_string().dimmed().bold(); + let padding = if daemon.is_empty() { "" } 
else { " " }; + format!("{node_id}{padding}{daemon}{colon} ") + } + None if daemon.is_empty() => "".into(), + None => format!("{daemon}{colon} "), }; let target = match target { - Some(target) => format!(" {target}\t").dimmed(), + Some(target) => format!("{target} ").dimmed(), None => "".normal(), }; - println!("{level}\t{dataflow}{daemon}{node}{target}: {message}"); + println!("{node}{level} {target}{dataflow} {message}"); } From 5993403d2821c6f92fed6549227c1ddc89e45c80 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 Jun 2025 19:38:11 +0200 Subject: [PATCH 091/101] Print build_id instead of session_id --- binaries/coordinator/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/binaries/coordinator/src/lib.rs b/binaries/coordinator/src/lib.rs index 8e3a4c23..e3a8d767 100644 --- a/binaries/coordinator/src/lib.rs +++ b/binaries/coordinator/src/lib.rs @@ -1291,7 +1291,7 @@ async fn build_dataflow( for (machine, nodes_on_machine) in &nodes_by_daemon { let nodes_on_machine = nodes_on_machine.iter().map(|n| n.id.clone()).collect(); tracing::debug!( - "Running dataflow build `{session_id}` on machine `{machine:?}` (nodes: {nodes_on_machine:?})" + "Running dataflow build `{build_id}` on machine `{machine:?}` (nodes: {nodes_on_machine:?})" ); let build_command = BuildDataflowNodes { @@ -1318,7 +1318,7 @@ async fn build_dataflow( daemons.insert(daemon_id); } - tracing::info!("successfully triggered dataflow build `{session_id}`",); + tracing::info!("successfully triggered dataflow build `{build_id}`",); Ok(RunningBuild { errors: Vec::new(), From 5eeab6f905742819634877f92f16bd26085822f3 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 Jun 2025 19:39:05 +0200 Subject: [PATCH 092/101] Fix: Append session UUID to path, without `SessionId(...)` wrapper Parentheses in directory names confuse some build tools/scripts (e.g. openssl Makefile). --- binaries/daemon/src/lib.rs | 2 +- libraries/message/src/lib.rs | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 0e8f289f..c1e39d2b 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -2146,7 +2146,7 @@ impl Daemon { current_dir().context("failed to get daemon working dir")?; Ok(daemon_working_dir .join("_work") - .join(session_id.to_string())) + .join(session_id.uuid().to_string())) } } } diff --git a/libraries/message/src/lib.rs b/libraries/message/src/lib.rs index 962176f5..e5e2e33f 100644 --- a/libraries/message/src/lib.rs +++ b/libraries/message/src/lib.rs @@ -37,11 +37,9 @@ impl SessionId { pub fn generate() -> Self { Self(Uuid::new_v7(Timestamp::now(uuid::NoContext))) } -} -impl std::fmt::Display for SessionId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "SessionId({})", self.0) + pub fn uuid(&self) -> uuid::Uuid { + self.0 } } From 482a7ecb61dfd81d03b3d91842012a40fd6f31fe Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 Jun 2025 19:57:36 +0200 Subject: [PATCH 093/101] Put session file into `out` dir --- binaries/cli/src/session.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/binaries/cli/src/session.rs b/binaries/cli/src/session.rs index 9311fded..9a8ac5b8 100644 --- a/binaries/cli/src/session.rs +++ b/binaries/cli/src/session.rs @@ -49,6 +49,9 @@ impl DataflowSession { .context("session file has no file name")? 
.to_str() .context("session file name is no utf8")?; + if let Some(parent) = session_file.parent() { + std::fs::create_dir_all(parent).context("failed to create out dir")?; + } std::fs::write(&session_file, self.serialize()?) .context("failed to write dataflow session file")?; let gitignore = session_file.with_file_name(".gitignore"); @@ -88,6 +91,8 @@ fn session_file_path(dataflow_path: &Path) -> eyre::Result { .wrap_err("dataflow path has no file stem")? .to_str() .wrap_err("dataflow file stem is not valid utf-8")?; - let session_file = dataflow_path.with_file_name(format!("{file_stem}.dora-session.yaml")); + let session_file = dataflow_path + .with_file_name("out") + .join(format!("{file_stem}.dora-session.yaml")); Ok(session_file) } From 84084b1475623266487b667ecdba50689261b670 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Wed, 18 Jun 2025 20:00:47 +0200 Subject: [PATCH 094/101] Fix: Don't rename git dir if other nodes of same dataflow still need it --- binaries/cli/src/command/build/local.rs | 8 +++- binaries/daemon/src/lib.rs | 8 +++- libraries/core/src/build/git.rs | 58 +++++++++++++++---------- libraries/core/src/build/mod.rs | 14 +++--- 4 files changed, 55 insertions(+), 33 deletions(-) diff --git a/binaries/cli/src/command/build/local.rs b/binaries/cli/src/command/build/local.rs index 32c7b319..7f6c2557 100644 --- a/binaries/cli/src/command/build/local.rs +++ b/binaries/cli/src/command/build/local.rs @@ -2,7 +2,7 @@ use std::{collections::BTreeMap, path::PathBuf}; use colored::Colorize; use dora_core::{ - build::{BuildInfo, BuildLogger, Builder, GitManager, LogLevelOrStdout}, + build::{BuildInfo, BuildLogger, Builder, GitManager, LogLevelOrStdout, PrevGitSource}, descriptor::{Descriptor, DescriptorExt}, }; use dora_message::{common::GitSource, id::NodeId}; @@ -52,13 +52,17 @@ async fn build_dataflow( let node_id = node.id.clone(); let git_source = git_sources.get(&node_id).cloned(); let prev_git_source = prev_git_sources.get(&node_id).cloned(); + let prev_git = prev_git_source.map(|prev_source| PrevGitSource { + still_needed_for_this_build: git_sources.values().any(|s| s == &prev_source), + git_source: prev_source, + }); let task = builder .clone() .build_node( node, git_source, - prev_git_source, + prev_git, LocalBuildLogger { node_id: node_id.clone(), }, diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index c1e39d2b..c108410d 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -2,7 +2,7 @@ use aligned_vec::{AVec, ConstAlign}; use coordinator::CoordinatorEvent; use crossbeam::queue::ArrayQueue; use dora_core::{ - build::{self, BuildInfo, GitManager}, + build::{self, BuildInfo, GitManager, PrevGitSource}, config::{DataId, Input, InputMapping, NodeId, NodeRunConfig, OperatorId}, descriptor::{ read_as_descriptor, CoreNodeKind, Descriptor, DescriptorExt, ResolvedNode, RuntimeNode, @@ -1011,6 +1011,10 @@ impl Daemon { logger.log(LogLevel::Info, "building").await; let git_source = git_sources.get(&node_id).cloned(); let prev_git_source = prev_git_sources.get(&node_id).cloned(); + let prev_git = prev_git_source.map(|prev_source| PrevGitSource { + still_needed_for_this_build: git_sources.values().any(|s| s == &prev_source), + git_source: prev_source, + }); let logger_cloned = logger .try_clone_impl() @@ -1028,7 +1032,7 @@ impl Daemon { .build_node( node, git_source, - prev_git_source, + prev_git, logger_cloned, &mut self.git_manager, ) diff --git a/libraries/core/src/build/git.rs b/libraries/core/src/build/git.rs index 
9e938ad4..3a27e4e1 100644 --- a/libraries/core/src/build/git.rs +++ b/libraries/core/src/build/git.rs @@ -1,5 +1,5 @@ -use crate::build::BuildLogger; -use dora_message::{common::LogLevel, descriptor::GitRepoRev, DataflowId, SessionId}; +use crate::build::{BuildLogger, PrevGitSource}; +use dora_message::{common::LogLevel, DataflowId, SessionId}; use eyre::{bail, ContextCompat, WrapErr}; use git2::FetchOptions; use itertools::Itertools; @@ -30,13 +30,19 @@ impl GitManager { pub fn choose_clone_dir( &mut self, session_id: SessionId, - repo_url: Url, + repo: String, commit_hash: String, - prev_commit_hash: Option, + prev_git: Option, target_dir: &Path, ) -> eyre::Result { + let repo_url = Url::parse(&repo).context("failed to parse git repository URL")?; let clone_dir = Self::clone_dir_path(target_dir, &repo_url, &commit_hash)?; + let prev_commit_hash = prev_git + .as_ref() + .filter(|p| p.git_source.repo == repo) + .map(|p| &p.git_source.commit_hash); + if let Some(using) = self.clones_in_use.get(&clone_dir) { if !using.is_empty() { // The directory is currently in use by another dataflow. Rebuilding @@ -58,27 +64,31 @@ impl GitManager { } } else if let Some(previous_commit_hash) = prev_commit_hash { // we might be able to update a previous clone - let prev_clone_dir = - Self::clone_dir_path(target_dir, &repo_url, &previous_commit_hash)?; + let prev_clone_dir = Self::clone_dir_path(target_dir, &repo_url, previous_commit_hash)?; - if self - .clones_in_use - .get(&prev_clone_dir) - .map(|ids| !ids.is_empty()) - .unwrap_or(false) - { - // previous clone is still in use -> we cannot rename it, but we can copy it - ReuseOptions::CopyAndFetch { - from: prev_clone_dir, - target_dir: clone_dir.clone(), - commit_hash, - } - } else if prev_clone_dir.exists() { - // there is an unused previous clone that is not in use -> rename it - ReuseOptions::RenameAndFetch { - from: prev_clone_dir, - target_dir: clone_dir.clone(), - commit_hash, + if prev_clone_dir.exists() { + let still_needed = prev_git + .map(|g| g.still_needed_for_this_build) + .unwrap_or(false); + let used_by_others = self + .clones_in_use + .get(&prev_clone_dir) + .map(|ids| !ids.is_empty()) + .unwrap_or(false); + if still_needed || used_by_others { + // previous clone is still in use -> we cannot rename it, but we can copy it + ReuseOptions::CopyAndFetch { + from: prev_clone_dir, + target_dir: clone_dir.clone(), + commit_hash, + } + } else { + // there is an unused previous clone that is no longer needed -> rename it + ReuseOptions::RenameAndFetch { + from: prev_clone_dir, + target_dir: clone_dir.clone(), + commit_hash, + } } } else { // no existing clone associated with previous build id diff --git a/libraries/core/src/build/mod.rs b/libraries/core/src/build/mod.rs index b0449d35..5e7193d5 100644 --- a/libraries/core/src/build/mod.rs +++ b/libraries/core/src/build/mod.rs @@ -33,19 +33,17 @@ impl Builder { self, node: ResolvedNode, git: Option, - prev_git: Option, + prev_git: Option, mut logger: impl BuildLogger, git_manager: &mut GitManager, ) -> eyre::Result>> { let prepared_git = if let Some(GitSource { repo, commit_hash }) = git { - let repo_url = Url::parse(&repo).context("failed to parse git repository URL")?; let target_dir = self.base_working_dir.join("git"); - let prev_hash = prev_git.filter(|p| p.repo == repo).map(|p| p.commit_hash); let git_folder = git_manager.choose_clone_dir( self.session_id, - repo_url, + repo, commit_hash, - prev_hash, + prev_git, &target_dir, )?; Some(git_folder) @@ -142,3 +140,9 @@ pub struct BuiltNode { pub 
struct BuildInfo {
     pub node_working_dirs: BTreeMap,
 }
+
+pub struct PrevGitSource {
+    pub git_source: GitSource,
+    /// `true` if any nodes of this dataflow still require the source for building.
+    pub still_needed_for_this_build: bool,
+}

From 9eac01363760031d74ec07c6f053011eea7dff20 Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Fri, 20 Jun 2025 10:15:21 +0200
Subject: [PATCH 095/101] Fix clone directory copying

---
 Cargo.lock                      |  7 +++++++
 binaries/daemon/src/lib.rs      |  2 ++
 libraries/core/Cargo.toml       |  1 +
 libraries/core/src/build/git.rs | 28 ++++++++++++++++++++++++----
 4 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3847931d..502cde60 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2991,6 +2991,7 @@ dependencies = [
 "dora-message",
 "dunce",
 "eyre",
+ "fs_extra",
 "git2",
 "itertools 0.14.0",
 "log",
@@ -4328,6 +4329,12 @@ dependencies = [
 "percent-encoding",
 ]

+[[package]]
+name = "fs_extra"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
+
 [[package]]
 name = "fsevent-sys"
 version = "4.1.0"
diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs
index c108410d..14df635c 100644
--- a/binaries/daemon/src/lib.rs
+++ b/binaries/daemon/src/lib.rs
@@ -998,6 +998,8 @@ impl Daemon {
             base_working_dir,
             uv,
         };
+        self.git_manager.clear_planned_builds(session_id);
+
         let nodes = dataflow_descriptor.resolve_aliases_and_set_defaults()?;

         let mut tasks = Vec::new();
diff --git a/libraries/core/Cargo.toml b/libraries/core/Cargo.toml
index 11450c29..41f2e112 100644
--- a/libraries/core/Cargo.toml
+++ b/libraries/core/Cargo.toml
@@ -30,3 +30,4 @@ dunce = "1.0.5"
 itertools = "0.14"
 url = { version = "2.5.4", optional = true }
 git2 = { workspace = true, optional = true }
+fs_extra = "1.3.0"
diff --git a/libraries/core/src/build/git.rs b/libraries/core/src/build/git.rs
index 3a27e4e1..7e06f2e0 100644
--- a/libraries/core/src/build/git.rs
+++ b/libraries/core/src/build/git.rs
@@ -144,6 +144,10 @@ impl GitManager {
         let path = path.join(commit_hash);
         Ok(dunce::simplified(&path).to_owned())
     }
+
+    pub fn clear_planned_builds(&mut self, session_id: SessionId) {
+        self.prepared_builds.remove(&session_id);
+    }
 }

 pub struct GitFolder {
@@ -155,7 +159,7 @@ impl GitFolder {
     pub async fn prepare(self, logger: &mut impl BuildLogger) -> eyre::Result {
         let GitFolder { reuse } = self;

-        eprintln!("reuse: {reuse:?}");
+        tracing::info!("reuse: {reuse:?}");
         let clone_dir = match reuse {
             ReuseOptions::NewClone {
                 target_dir,
@@ -208,9 +212,25 @@ impl GitFolder {
                 target_dir,
                 commit_hash,
             } => {
-                tokio::fs::copy(&from, &target_dir)
-                    .await
-                    .context("failed to copy repo clone")?;
+                let from_clone = from.clone();
+                let to = target_dir.clone();
+                tokio::task::spawn_blocking(move || {
+                    std::fs::create_dir_all(&to)
+                        .context("failed to create directory for copying git repo")?;
+                    fs_extra::dir::copy(
+                        &from_clone,
+                        &to,
+                        &fs_extra::dir::CopyOptions::new().content_only(true),
+                    )
+                    .with_context(|| {
+                        format!(
+                            "failed to copy repo clone from `{}` to `{}`",
+                            from_clone.display(),
+                            to.display()
+                        )
+                    })
+                })
+                .await??;

                 logger
                     .log_message(
From 65ded260c92ab6f60e5baf361f81430c6e1a3ea6 Mon Sep 17 00:00:00 2001
From: Philipp Oppermann
Date: Fri, 20 Jun 2025 16:50:12 +0200
Subject: [PATCH 096/101] Convert `log` records to `tracing` events

Ensures that `log::info` etc. messages are logged too.
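
For illustration, the bridge amounts to the following minimal sketch. It is
illustrative only: it installs a plain `tracing_subscriber` fmt subscriber
for brevity, whereas the dora CLI configures its own subscriber via
`TracingBuilder` (see the diff below).

```rust
// Minimal sketch of the `log` -> `tracing` bridge (assumes the `log`,
// `tracing`, `tracing-log`, and `tracing-subscriber` crates).
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Register LogTracer as the global `log` logger; it re-emits every
    // `log` record as a `tracing` event.
    tracing_log::LogTracer::init()?;

    // Install a subscriber; it now receives both macro families.
    let subscriber = tracing_subscriber::fmt().finish();
    tracing::subscriber::set_global_default(subscriber)?;

    log::info!("recorded via `log`, surfaced through `tracing`");
    tracing::info!("recorded via `tracing` directly");
    Ok(())
}
```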
--- Cargo.lock | 1 + binaries/cli/Cargo.toml | 1 + binaries/cli/src/command/build/mod.rs | 2 +- binaries/cli/src/lib.rs | 2 ++ 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 502cde60..c56d68ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2959,6 +2959,7 @@ dependencies = [ "tokio", "tokio-stream", "tracing", + "tracing-log 0.2.0", "uuid 1.16.0", "webbrowser 0.8.15", ] diff --git a/binaries/cli/Cargo.toml b/binaries/cli/Cargo.toml index 349b13d3..7aa97db6 100644 --- a/binaries/cli/Cargo.toml +++ b/binaries/cli/Cargo.toml @@ -37,6 +37,7 @@ communication-layer-request-reply = { workspace = true } notify = "5.1.0" ctrlc = "3.2.5" tracing = "0.1.36" +tracing-log = "0.2.0" dora-tracing = { workspace = true, optional = true } bat = "0.24.0" dora-daemon = { workspace = true } diff --git a/binaries/cli/src/command/build/mod.rs b/binaries/cli/src/command/build/mod.rs index fff1d452..344709a0 100644 --- a/binaries/cli/src/command/build/mod.rs +++ b/binaries/cli/src/command/build/mod.rs @@ -72,7 +72,7 @@ pub fn build( match build_kind { BuildKind::Local => { - println!("running local build"); + log::info!("running local build"); // use dataflow dir as base working dir let local_working_dir = dunce::canonicalize(&dataflow_path) .context("failed to canonicalize dataflow path")? diff --git a/binaries/cli/src/lib.rs b/binaries/cli/src/lib.rs index b06b361e..fce649a3 100644 --- a/binaries/cli/src/lib.rs +++ b/binaries/cli/src/lib.rs @@ -306,6 +306,8 @@ pub fn lib_main(args: Args) { } fn run_cli(args: Args) -> eyre::Result<()> { + tracing_log::LogTracer::init()?; + #[cfg(feature = "tracing")] match &args.command { Command::Daemon { From 4392affe16ab8510e432bea9a9208b1facf5424b Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 20 Jun 2025 16:51:59 +0200 Subject: [PATCH 097/101] Tweak heuristic for doing local or distributed builds --- binaries/cli/src/command/build/mod.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/binaries/cli/src/command/build/mod.rs b/binaries/cli/src/command/build/mod.rs index 344709a0..92a33a8c 100644 --- a/binaries/cli/src/command/build/mod.rs +++ b/binaries/cli/src/command/build/mod.rs @@ -45,25 +45,31 @@ pub fn build( } } - let session = connect_to_coordinator_with_defaults(coordinator_addr, coordinator_port); + let session = || connect_to_coordinator_with_defaults(coordinator_addr, coordinator_port); let build_kind = if force_local { - // user explicitly requested a local build + log::info!("Building locally, as requested through `--force-local`"); + BuildKind::Local + } else if dataflow_descriptor.nodes.iter().all(|n| n.deploy.is_none()) { + log::info!("Building locally because dataflow does not contain any `deploy` sections"); BuildKind::Local } else if coordinator_addr.is_some() || coordinator_port.is_some() { + log::info!("Building through coordinator, using the given cooridnator socket information"); // explicit coordinator address or port set -> there should be a coordinator running BuildKind::ThroughCoordinator { - coordinator_session: session.context("failed to connect to coordinator")?, + coordinator_session: session().context("failed to connect to coordinator")?, } } else { - match session { + match session() { Ok(coordinator_session) => { // we found a local coordinator instance at default port -> use it for building + log::info!("Found local dora coordinator instance -> building through coordinator"); BuildKind::ThroughCoordinator { coordinator_session, } } Err(_) => { + 
log::warn!("No dora coordinator instance found -> trying a local build"); // no coordinator instance found -> do a local build BuildKind::Local } From bf54c6d706533c9cb23af06c3061b1c446a4e629 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Fri, 20 Jun 2025 16:52:42 +0200 Subject: [PATCH 098/101] Improve log output for `dora run` Send the log messages back to the CLI instead of logging them through tracing at the `daemon` level. --- binaries/cli/src/command/run.rs | 15 ++- binaries/cli/src/lib.rs | 8 +- binaries/daemon/src/lib.rs | 38 +++--- binaries/daemon/src/log.rs | 213 ++++++++++++++++++-------------- 4 files changed, 162 insertions(+), 112 deletions(-) diff --git a/binaries/cli/src/command/run.rs b/binaries/cli/src/command/run.rs index df01d16e..7b1adba7 100644 --- a/binaries/cli/src/command/run.rs +++ b/binaries/cli/src/command/run.rs @@ -1,8 +1,10 @@ -use dora_daemon::Daemon; +use dora_daemon::{flume, Daemon, LogDestination}; use eyre::Context; use tokio::runtime::Builder; -use crate::{handle_dataflow_result, resolve_dataflow, session::DataflowSession}; +use crate::{ + handle_dataflow_result, output::print_log_message, resolve_dataflow, session::DataflowSession, +}; pub fn run(dataflow: String, uv: bool) -> Result<(), eyre::Error> { let dataflow_path = resolve_dataflow(dataflow).context("could not resolve dataflow")?; @@ -12,12 +14,21 @@ pub fn run(dataflow: String, uv: bool) -> Result<(), eyre::Error> { .enable_all() .build() .context("tokio runtime failed")?; + + let (log_tx, log_rx) = flume::bounded(100); + std::thread::spawn(move || { + for message in log_rx { + print_log_message(message, false, false); + } + }); + let result = rt.block_on(Daemon::run_dataflow( &dataflow_path, dataflow_session.build_id, dataflow_session.local_build, dataflow_session.session_id, uv, + LogDestination::Channel { sender: log_tx }, ))?; handle_dataflow_result(result, None) } diff --git a/binaries/cli/src/lib.rs b/binaries/cli/src/lib.rs index fce649a3..9c3cfd41 100644 --- a/binaries/cli/src/lib.rs +++ b/binaries/cli/src/lib.rs @@ -8,7 +8,7 @@ use dora_core::{ DORA_DAEMON_LOCAL_LISTEN_PORT_DEFAULT, }, }; -use dora_daemon::Daemon; +use dora_daemon::{Daemon, LogDestination}; use dora_download::download_file; use dora_message::{ cli_to_coordinator::ControlRequest, @@ -342,7 +342,7 @@ fn run_cli(args: Args) -> eyre::Result<()> { .build() .wrap_err("failed to set up tracing subscriber")?; } - Command::Run { .. } => { + Command::Run { .. } | Command::Build { .. 
} => { let log_level = std::env::var("RUST_LOG").ok().unwrap_or("info".to_string()); TracingBuilder::new("run") .with_stdout(log_level) @@ -525,7 +525,9 @@ fn run_cli(args: Args) -> eyre::Result<()> { DataflowSession::read_session(&dataflow_path).context("failed to read DataflowSession")?; let result = Daemon::run_dataflow(&dataflow_path, - dataflow_session.build_id, dataflow_session.local_build, dataflow_session.session_id, false).await?; + dataflow_session.build_id, dataflow_session.local_build, dataflow_session.session_id, false, + LogDestination::Tracing, + ).await?; handle_dataflow_result(result, None) } None => { diff --git a/binaries/daemon/src/lib.rs b/binaries/daemon/src/lib.rs index 14df635c..d23dd2b8 100644 --- a/binaries/daemon/src/lib.rs +++ b/binaries/daemon/src/lib.rs @@ -63,6 +63,9 @@ use tokio_stream::{wrappers::ReceiverStream, Stream, StreamExt}; use tracing::{error, warn}; use uuid::{NoContext, Timestamp, Uuid}; +pub use flume; +pub use log::LogDestination; + mod coordinator; mod local_listener; mod log; @@ -146,6 +149,20 @@ impl Daemon { future::Either::Right((events, _)) => events?, } }; + + let log_destination = { + // additional connection for logging + let stream = TcpStream::connect(coordinator_addr) + .await + .wrap_err("failed to connect log to dora-coordinator")?; + stream + .set_nodelay(true) + .wrap_err("failed to set TCP_NODELAY")?; + LogDestination::Coordinator { + coordinator_connection: stream, + } + }; + Self::run_general( (ReceiverStream::new(ctrlc_events), incoming_events).merge(), Some(coordinator_addr), @@ -154,6 +171,7 @@ impl Daemon { clock, Some(remote_daemon_events_tx), Default::default(), + log_destination, ) .await .map(|_| ()) @@ -165,6 +183,7 @@ impl Daemon { local_build: Option, session_id: SessionId, uv: bool, + log_destination: LogDestination, ) -> eyre::Result { let working_dir = dataflow_path .canonicalize() @@ -236,6 +255,7 @@ impl Daemon { } else { Default::default() }, + log_destination, ); let spawn_result = reply_rx @@ -260,6 +280,7 @@ impl Daemon { }) } + #[allow(clippy::too_many_arguments)] async fn run_general( external_events: impl Stream> + Unpin, coordinator_addr: Option, @@ -268,6 +289,7 @@ impl Daemon { clock: Arc, remote_daemon_events_tx: Option>>>, builds: BTreeMap, + log_destination: LogDestination, ) -> eyre::Result { let coordinator_connection = match coordinator_addr { Some(addr) => { @@ -282,20 +304,6 @@ impl Daemon { None => None, }; - // additional connection for logging - let logger_coordinator_connection = match coordinator_addr { - Some(addr) => { - let stream = TcpStream::connect(addr) - .await - .wrap_err("failed to connect log to dora-coordinator")?; - stream - .set_nodelay(true) - .wrap_err("failed to set TCP_NODELAY")?; - Some(stream) - } - None => None, - }; - let zenoh_session = match std::env::var(zenoh::Config::DEFAULT_CONFIG_PATH_ENV) { Ok(path) => { let zenoh_config = zenoh::Config::from_file(&path) @@ -392,7 +400,7 @@ impl Daemon { let (dora_events_tx, dora_events_rx) = mpsc::channel(5); let daemon = Self { logger: Logger { - coordinator_connection: logger_coordinator_connection, + destination: log_destination, daemon_id: daemon_id.clone(), clock: clock.clone(), } diff --git a/binaries/daemon/src/log.rs b/binaries/daemon/src/log.rs index 7092d328..283213c8 100644 --- a/binaries/daemon/src/log.rs +++ b/binaries/daemon/src/log.rs @@ -15,6 +15,7 @@ use dora_message::{ BuildId, }; use eyre::Context; +use flume::Sender; use tokio::net::TcpStream; use uuid::Uuid; @@ -230,7 +231,7 @@ impl DaemonLogger { } pub 
struct Logger { - pub(super) coordinator_connection: Option, + pub(super) destination: LogDestination, pub(super) daemon_id: DaemonId, pub(super) clock: Arc, } @@ -244,117 +245,145 @@ impl Logger { } pub async fn log(&mut self, message: LogMessage) { - if let Some(connection) = &mut self.coordinator_connection { - let msg = serde_json::to_vec(&Timestamped { - inner: CoordinatorRequest::Event { - daemon_id: self.daemon_id.clone(), - event: DaemonEvent::Log(message.clone()), - }, - timestamp: self.clock.new_timestamp(), - }) - .expect("failed to serialize log message"); - match socket_stream_send(connection, &msg) - .await - .wrap_err("failed to send log message to dora-coordinator") - { - Ok(()) => return, - Err(err) => tracing::warn!("{err:?}"), + match &mut self.destination { + LogDestination::Coordinator { + coordinator_connection, + } => { + let message = Timestamped { + inner: CoordinatorRequest::Event { + daemon_id: self.daemon_id.clone(), + event: DaemonEvent::Log(message.clone()), + }, + timestamp: self.clock.new_timestamp(), + }; + Self::log_to_coordinator(message, coordinator_connection).await } - } - - // log message using tracing if reporting to coordinator is not possible - match message.level { - LogLevelOrStdout::Stdout => { - tracing::info!( - build_id = ?message.build_id.map(|id| id.to_string()), - dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), - node_id = ?message.node_id.map(|id| id.to_string()), - target = message.target, - module_path = message.module_path, - file = message.file, - line = message.line, - "{}", - Indent(&message.message) - ) + LogDestination::Channel { sender } => { + let _ = sender.send_async(message).await; } - LogLevelOrStdout::LogLevel(level) => match level { - LogLevel::Error => { - tracing::error!( - build_id = ?message.build_id.map(|id| id.to_string()), - dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), - node_id = ?message.node_id.map(|id| id.to_string()), - target = message.target, - module_path = message.module_path, - file = message.file, - line = message.line, - "{}", - Indent(&message.message) - ); - } - LogLevel::Warn => { - tracing::warn!( - build_id = ?message.build_id.map(|id| id.to_string()), - dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), - node_id = ?message.node_id.map(|id| id.to_string()), - target = message.target, - module_path = message.module_path, - file = message.file, - line = message.line, - "{}", - Indent(&message.message) - ); - } - LogLevel::Info => { - tracing::info!( - build_id = ?message.build_id.map(|id| id.to_string()), - dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), - node_id = ?message.node_id.map(|id| id.to_string()), - target = message.target, - module_path = message.module_path, - file = message.file, - line = message.line, - "{}", - Indent(&message.message) - ); + LogDestination::Tracing => { + // log message using tracing if reporting to coordinator is not possible + match message.level { + LogLevelOrStdout::Stdout => { + tracing::info!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ) + } + LogLevelOrStdout::LogLevel(level) => match level { + LogLevel::Error => { + tracing::error!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| 
id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + LogLevel::Warn => { + tracing::warn!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + LogLevel::Info => { + tracing::info!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + LogLevel::Debug => { + tracing::debug!( + build_id = ?message.build_id.map(|id| id.to_string()), + dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), + node_id = ?message.node_id.map(|id| id.to_string()), + target = message.target, + module_path = message.module_path, + file = message.file, + line = message.line, + "{}", + Indent(&message.message) + ); + } + _ => {} + }, } - LogLevel::Debug => { - tracing::debug!( - build_id = ?message.build_id.map(|id| id.to_string()), - dataflow_id = ?message.dataflow_id.map(|id| id.to_string()), - node_id = ?message.node_id.map(|id| id.to_string()), - target = message.target, - module_path = message.module_path, - file = message.file, - line = message.line, - "{}", - Indent(&message.message) - ); - } - _ => {} - }, + } } } pub async fn try_clone(&self) -> eyre::Result { - let coordinator_connection = match &self.coordinator_connection { - Some(c) => { - let addr = c + let destination = match &self.destination { + LogDestination::Coordinator { + coordinator_connection, + } => { + let addr = coordinator_connection .peer_addr() .context("failed to get coordinator peer addr")?; let new_connection = TcpStream::connect(addr) .await .context("failed to connect to coordinator during logger clone")?; - Some(new_connection) + LogDestination::Coordinator { + coordinator_connection: new_connection, + } } - None => None, + LogDestination::Channel { sender } => LogDestination::Channel { + sender: sender.clone(), + }, + LogDestination::Tracing => LogDestination::Tracing, }; Ok(Self { - coordinator_connection, + destination, daemon_id: self.daemon_id.clone(), clock: self.clock.clone(), }) } + + async fn log_to_coordinator( + message: Timestamped, + connection: &mut TcpStream, + ) { + let msg = serde_json::to_vec(&message).expect("failed to serialize log message"); + match socket_stream_send(connection, &msg) + .await + .wrap_err("failed to send log message to dora-coordinator") + { + Ok(()) => return, + Err(err) => tracing::warn!("{err:?}"), + } + } +} + +pub enum LogDestination { + Coordinator { coordinator_connection: TcpStream }, + Channel { sender: Sender }, + Tracing, } enum CowMut<'a, T> { From f030d8d6b5914d2c3b55f893785198af8c8b37bd Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 23 Jun 2025 10:41:30 +0200 Subject: [PATCH 099/101] Fix typo --- binaries/cli/src/command/build/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/binaries/cli/src/command/build/mod.rs b/binaries/cli/src/command/build/mod.rs index 92a33a8c..e0046666 100644 --- 
a/binaries/cli/src/command/build/mod.rs +++ b/binaries/cli/src/command/build/mod.rs @@ -54,7 +54,7 @@ pub fn build( log::info!("Building locally because dataflow does not contain any `deploy` sections"); BuildKind::Local } else if coordinator_addr.is_some() || coordinator_port.is_some() { - log::info!("Building through coordinator, using the given cooridnator socket information"); + log::info!("Building through coordinator, using the given coordinator socket information"); // explicit coordinator address or port set -> there should be a coordinator running BuildKind::ThroughCoordinator { coordinator_session: session().context("failed to connect to coordinator")?, From 38ff2c8d63c6e06241543ecde7b17c43b0285f53 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Mon, 23 Jun 2025 13:58:10 +0200 Subject: [PATCH 100/101] Run `dora` executable in release mode for examples Avoids stack overflows on Windows --- examples/c++-arrow-dataflow/run.rs | 1 + examples/c++-dataflow/run.rs | 1 + examples/c++-ros2-dataflow/run.rs | 1 + examples/c-dataflow/run.rs | 1 + examples/camera/run.rs | 2 ++ examples/cmake-dataflow/run.rs | 1 + examples/multiple-daemons/run.rs | 2 ++ examples/python-dataflow/run.rs | 2 ++ examples/python-multi-env/run.rs | 2 ++ examples/python-operator-dataflow/run.rs | 2 ++ examples/python-ros2-dataflow/run.rs | 2 ++ examples/rerun-viewer/run.rs | 2 ++ examples/rust-dataflow-git/run.rs | 2 ++ examples/rust-dataflow-url/run.rs | 2 ++ examples/rust-dataflow/run.rs | 2 ++ examples/rust-ros2-dataflow/run.rs | 2 ++ examples/vlm/run.rs | 2 ++ 17 files changed, 29 insertions(+) diff --git a/examples/c++-arrow-dataflow/run.rs b/examples/c++-arrow-dataflow/run.rs index 3fe206d1..a77c4d78 100644 --- a/examples/c++-arrow-dataflow/run.rs +++ b/examples/c++-arrow-dataflow/run.rs @@ -112,6 +112,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/c++-dataflow/run.rs b/examples/c++-dataflow/run.rs index 9ee11168..dd88900a 100644 --- a/examples/c++-dataflow/run.rs +++ b/examples/c++-dataflow/run.rs @@ -133,6 +133,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/c++-ros2-dataflow/run.rs b/examples/c++-ros2-dataflow/run.rs index 0be1f9f4..4af3cd31 100644 --- a/examples/c++-ros2-dataflow/run.rs +++ b/examples/c++-ros2-dataflow/run.rs @@ -155,6 +155,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/c-dataflow/run.rs b/examples/c-dataflow/run.rs index da88f64b..e71d802b 100644 --- a/examples/c-dataflow/run.rs +++ b/examples/c-dataflow/run.rs @@ -44,6 +44,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/camera/run.rs b/examples/camera/run.rs index 9c475c26..94988261 100644 --- a/examples/camera/run.rs +++ b/examples/camera/run.rs @@ -43,6 
+43,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -51,6 +52,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/examples/cmake-dataflow/run.rs b/examples/cmake-dataflow/run.rs index 30e3c9d1..b4530f26 100644 --- a/examples/cmake-dataflow/run.rs +++ b/examples/cmake-dataflow/run.rs @@ -61,6 +61,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/multiple-daemons/run.rs b/examples/multiple-daemons/run.rs index ecb5794e..cb558af3 100644 --- a/examples/multiple-daemons/run.rs +++ b/examples/multiple-daemons/run.rs @@ -240,6 +240,7 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow); if !cmd.status().await?.success() { bail!("failed to build dataflow"); @@ -252,6 +253,7 @@ async fn run_daemon(coordinator: String, machine_id: &str) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--machine-id") diff --git a/examples/python-dataflow/run.rs b/examples/python-dataflow/run.rs index 23b254e2..de96795d 100644 --- a/examples/python-dataflow/run.rs +++ b/examples/python-dataflow/run.rs @@ -44,6 +44,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -52,6 +53,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/examples/python-multi-env/run.rs b/examples/python-multi-env/run.rs index 23b254e2..de96795d 100644 --- a/examples/python-multi-env/run.rs +++ b/examples/python-multi-env/run.rs @@ -44,6 +44,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -52,6 +53,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if 
!cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/examples/python-operator-dataflow/run.rs b/examples/python-operator-dataflow/run.rs index 9c475c26..94988261 100644 --- a/examples/python-operator-dataflow/run.rs +++ b/examples/python-operator-dataflow/run.rs @@ -43,6 +43,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -51,6 +52,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/examples/python-ros2-dataflow/run.rs b/examples/python-ros2-dataflow/run.rs index 23b254e2..de96795d 100644 --- a/examples/python-ros2-dataflow/run.rs +++ b/examples/python-ros2-dataflow/run.rs @@ -44,6 +44,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -52,6 +53,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/examples/rerun-viewer/run.rs b/examples/rerun-viewer/run.rs index 4785ba9b..243db076 100644 --- a/examples/rerun-viewer/run.rs +++ b/examples/rerun-viewer/run.rs @@ -43,6 +43,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -51,6 +52,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); diff --git a/examples/rust-dataflow-git/run.rs b/examples/rust-dataflow-git/run.rs index 490c5c57..855eb85b 100644 --- a/examples/rust-dataflow-git/run.rs +++ b/examples/rust-dataflow-git/run.rs @@ -28,6 +28,7 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow); if !cmd.status().await?.success() { bail!("failed to build dataflow"); @@ -40,6 +41,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/rust-dataflow-url/run.rs b/examples/rust-dataflow-url/run.rs index 6f511970..158e8ed9 100644 
--- a/examples/rust-dataflow-url/run.rs +++ b/examples/rust-dataflow-url/run.rs @@ -23,6 +23,7 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow); if !cmd.status().await?.success() { bail!("failed to build dataflow"); @@ -35,6 +36,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/rust-dataflow/run.rs b/examples/rust-dataflow/run.rs index 490c5c57..855eb85b 100644 --- a/examples/rust-dataflow/run.rs +++ b/examples/rust-dataflow/run.rs @@ -28,6 +28,7 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow); if !cmd.status().await?.success() { bail!("failed to build dataflow"); @@ -40,6 +41,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/rust-ros2-dataflow/run.rs b/examples/rust-ros2-dataflow/run.rs index a14dce48..f81a25d5 100644 --- a/examples/rust-ros2-dataflow/run.rs +++ b/examples/rust-ros2-dataflow/run.rs @@ -23,6 +23,7 @@ async fn build_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow); if !cmd.status().await?.success() { bail!("failed to build dataflow"); @@ -35,6 +36,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--") .arg("daemon") .arg("--run-dataflow") diff --git a/examples/vlm/run.rs b/examples/vlm/run.rs index 1ec38c80..742c3818 100644 --- a/examples/vlm/run.rs +++ b/examples/vlm/run.rs @@ -43,6 +43,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("build").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); @@ -51,6 +52,7 @@ async fn run_dataflow(dataflow: &Path) -> eyre::Result<()> { let mut cmd = tokio::process::Command::new(&cargo); cmd.arg("run"); cmd.arg("--package").arg("dora-cli"); + cmd.arg("--release"); cmd.arg("--").arg("run").arg(dataflow).arg("--uv"); if !cmd.status().await?.success() { bail!("failed to run dataflow"); From 798778f00993d972c45641ba054d3afb32fc8af2 Mon Sep 17 00:00:00 2001 From: Philipp Oppermann Date: Tue, 24 Jun 2025 08:12:42 +0200 Subject: [PATCH 101/101] Fix mismatched Python versions in CLI test --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b2005b2c..1d93b3ca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -342,7 +342,7 @@ jobs: # Test Python template Project dora new test_python_project --lang 
python --internal-create-with-path-dependencies cd test_python_project - uv venv --seed -p 3.11 + uv venv --seed -p 3.12 uv pip install -e ../apis/python/node uv pip install ruff pytest @@ -372,7 +372,7 @@ jobs: # Run Python Node Example echo "Running Python Node Example" dora up - uv venv --seed -p 3.11 + uv venv --seed -p 3.12 uv pip install -e apis/python/node dora build examples/python-dataflow/dataflow.yml --uv dora start examples/python-dataflow/dataflow.yml --name ci-python --detach --uv
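
A closing note on the `--release` change in [PATCH 100/101]: every example
runner repeats the same `cargo run --package dora-cli --release` preamble. A
shared helper along these lines — hypothetical, not part of this series —
would keep the flag in one place:

```rust
use eyre::{bail, Context};

// Hypothetical helper (not in the tree): centralizes the repeated
// `cargo run --package dora-cli --release -- <args>` invocation used by
// the example runners. Assumes the `tokio` and `eyre` crates.
async fn dora_cli(args: &[&str]) -> eyre::Result<()> {
    let cargo = std::env::var("CARGO").unwrap_or_else(|_| "cargo".into());
    let mut cmd = tokio::process::Command::new(&cargo);
    cmd.arg("run");
    cmd.arg("--package").arg("dora-cli");
    // Debug builds of the `dora` executable can overflow the stack on
    // Windows, hence release mode (see [PATCH 100/101]).
    cmd.arg("--release");
    cmd.arg("--").args(args);
    if !cmd.status().await.context("failed to run dora-cli")?.success() {
        bail!("dora-cli invocation failed: {args:?}");
    }
    Ok(())
}
```

With such a helper, the `build_dataflow`/`run_dataflow` functions would
shrink to calls like `dora_cli(&["build", dataflow_str]).await?` — a sketch
only; the patches above intentionally touch each runner individually.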