Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm
0

Configure Feed

Select the types of activity you want to include in your feed.

trace everywhere for deadlock debug

+157 -43
+2
jetstream/src/error.rs
··· 46 46 WebSocketCloseFailure, 47 47 #[error("failed to send ping or pong: {0}")] 48 48 PingPongError(#[from] tokio_tungstenite::tungstenite::Error), 49 + #[error("jetstream event receiver closed")] 50 + ReceiverClosedError, 49 51 }
+15 -5
jetstream/src/lib.rs
··· 382 382 websocket_task(dict, ws_stream, send_channel.clone(), &mut last_cursor) 383 383 .await 384 384 { 385 + match e { 386 + JetstreamEventError::ReceiverClosedError => { 387 + log::error!("Jetstream receiver channel closed. Exiting consumer."); 388 + return; 389 + } 390 + _ => {} 391 + } 385 392 log::error!("Jetstream closed after encountering error: {e:?}"); 386 393 } else { 387 394 log::error!("Jetstream connection closed cleanly"); ··· 404 411 405 412 if retry_attempt > 0 { 406 413 // Exponential backoff 407 - let delay = (base_delay_ms * (2_u64.pow(retry_attempt))).min(max_delay_ms); 408 - log::error!("Connection failed, retrying in {delay}ms..."); 414 + let delay = 415 + (base_delay_ms * (2_u64.saturating_pow(retry_attempt))).min(max_delay_ms); 416 + log::error!("Connection failed, retry #{retry_attempt} in {delay}ms..."); 409 417 tokio::time::sleep(Duration::from_millis(delay)).await; 410 418 log::info!("Attempting to reconnect..."); 411 419 } ··· 451 459 log::info!( 452 460 "All receivers for the Jetstream connection have been dropped, closing connection." 453 461 ); 454 - closing_connection = true; 462 + socket_write.close().await?; 463 + return Err(JetstreamEventError::ReceiverClosedError); 455 464 } else if let Some(last) = last_cursor.as_mut() { 456 465 *last = event_cursor; 457 466 } ··· 484 493 // We can assume that all receivers have been dropped, so we can close 485 494 // the connection and exit the task. 486 495 log::info!( 487 - "All receivers for the Jetstream connection have been dropped, closing connection..." 496 + "All receivers for the Jetstream connection have been dropped, closing connection." 488 497 ); 489 - closing_connection = true; 498 + socket_write.close().await?; 499 + return Err(JetstreamEventError::ReceiverClosedError); 490 500 } else if let Some(last) = last_cursor.as_mut() { 491 501 *last = event_cursor; 492 502 }
+27
ufos/readme.md
··· 16 16 cross build --release --target arm-unknown-linux-gnueabihf && scp ../target/arm-unknown-linux-gnueabihf/release/ufos angel-hair.local:ufos 17 17 ``` 18 18 19 + for bonilla (rp4) 20 + 21 + ```bash 22 + cross build --release --target armv7-unknown-linux-gnueabihf && scp ../target/armv7-unknown-linux-gnueabihf/release/ufos pi@bonilla.local:ufos 23 + ``` 24 + 25 + glibc will cause problems when switching between (`GLIBC_2.25` message). clean up (next build will be slowww) 26 + 27 + ```bash 28 + cargo clean 29 + ``` 30 + 19 31 nginx forward proxy for websocket (run this on another host): 20 32 21 33 ```nginx ··· 95 107 ``` 96 108 97 109 try without info-level logs for better perf 110 + 111 + running on bonilla 112 + 113 + ```bash 114 + ./ufos --jetstream us-west-2 --jetstream-force --data /mnt/ufos-data-no-compression-2/ 115 + ``` 116 + 117 + (reusing data dir from angel-hair) 118 + 119 + 120 + ipv6 is having some trouble. but also maybe there's a deadlock somewhere 121 + 122 + ```bash 123 + sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 124 + ```
+12 -8
ufos/src/consumer.rs
··· 19 19 const MAX_ACCOUNT_REMOVES: usize = 512; // hard limit, total account deletions. actually the least frequent event, but tiny. 20 20 const MAX_BATCHED_COLLECTIONS: usize = 64; // hard limit, MAX_BATCHED_RECORDS applies per collection 21 21 const MIN_BATCH_SPAN_SECS: f64 = 2.; // try to get a bit of rest a bit. 22 - const MAX_BATCH_SPAN_SECS: f64 = 10.; // hard limit of duration from oldest to latest event cursor within a batch, in seconds. 23 - 24 - const SEND_TIMEOUT_S: f64 = 6.; 22 + const MAX_BATCH_SPAN_SECS: f64 = 60.; // hard limit of duration from oldest to latest event cursor within a batch, in seconds. 25 23 26 - const BATCH_QUEUE_SIZE: usize = 32; 27 - // const BATCH_QUEUE_SIZE: usize = 4096; 24 + const SEND_TIMEOUT_S: f64 = 60.; 25 + const BATCH_QUEUE_SIZE: usize = 1024; // 4096 got OOM'd 28 26 29 27 #[derive(Debug)] 30 28 struct Batcher { ··· 38 36 cursor: Option<Cursor>, 39 37 no_compress: bool, 40 38 ) -> anyhow::Result<Receiver<EventBatch>> { 39 + let endpoint = DefaultJetstreamEndpoints::endpoint_or_shortcut(jetstream_endpoint); 40 + if endpoint == jetstream_endpoint { 41 + eprintln!("connecting to jetstream at {endpoint}"); 42 + } else { 43 + eprintln!("connecting to jetstream at {jetstream_endpoint} => {endpoint}"); 44 + } 41 45 let config: JetstreamConfig<serde_json::Value> = JetstreamConfig { 42 - endpoint: DefaultJetstreamEndpoints::endpoint_or_shortcut(jetstream_endpoint), 46 + endpoint, 43 47 compression: if no_compress { 44 48 JetstreamCompression::None 45 49 } else { ··· 124 128 if event_cursor.duration_since(earliest)?.as_secs_f64() > MIN_BATCH_SPAN_SECS 125 129 && self.batch_sender.capacity() == BATCH_QUEUE_SIZE 126 130 { 127 - log::warn!("queue empty: immediately sending batch."); 131 + log::trace!("queue empty: immediately sending batch."); 128 132 if let Err(send_err) = self 129 133 .batch_sender 130 134 .send(mem::take(&mut self.current_batch)) ··· 140 144 // holds up all consumer progress until it can send to the channel 141 145 // use this when the current batch is too full to add more to it 142 146 async fn send_current_batch_now(&mut self) -> anyhow::Result<()> { 143 - log::warn!("attempting to send batch now"); 147 + log::warn!("attempting to send batch now (capacity: {})", self.batch_sender.capacity()); 144 148 self.batch_sender 145 149 .send_timeout( 146 150 mem::take(&mut self.current_batch),
+33 -7
ufos/src/main.rs
··· 30 30 data: PathBuf, 31 31 } 32 32 33 - // #[tokio::main] 34 - #[tokio::main(flavor = "current_thread")] // TODO: move this to config via args 33 + // #[tokio::main(flavor = "current_thread")] // TODO: move this to config via args 34 + #[tokio::main] 35 35 async fn main() -> anyhow::Result<()> { 36 36 env_logger::init(); 37 37 ··· 48 48 println!("starting server with storage..."); 49 49 let serving = server::serve(storage.clone()); 50 50 51 - tokio::select! { 52 - v = serving => eprintln!("serving ended: {v:?}"), 53 - v = storage.receive(batches) => eprintln!("storage consumer ended: {v:?}"), 54 - v = storage.rw_loop() => eprintln!("storage rw-loop ended: {v:?}"), 55 - }; 51 + let t1 = tokio::task::spawn(async { 52 + let r = serving.await; 53 + log::warn!("serving ended with: {r:?}"); 54 + }); 55 + 56 + let t2 = tokio::task::spawn({ 57 + let storage = storage.clone(); 58 + async move { 59 + let r = storage.receive(batches).await; 60 + log::warn!("storage.receive ended with: {r:?}"); 61 + } 62 + }); 63 + 64 + let t3 = tokio::task::spawn(async move { 65 + let r = storage.rw_loop().await; 66 + log::warn!("storage.rw_loop ended with: {r:?}"); 67 + }); 68 + 69 + // tokio::select! { 70 + // // v = serving => eprintln!("serving ended: {v:?}"), 71 + // v = storage.receive(batches) => eprintln!("storage consumer ended: {v:?}"), 72 + // v = storage.rw_loop() => eprintln!("storage rw-loop ended: {v:?}"), 73 + // }; 74 + 75 + log::trace!("tasks running. waiting."); 76 + t1.await?; 77 + log::trace!("serve task ended."); 78 + t2.await?; 79 + log::trace!("storage receive task ended."); 80 + t3.await?; 81 + log::trace!("storage rw task ended."); 56 82 57 83 println!("bye!"); 58 84
+68 -23
ufos/src/store.rs
··· 1 + // use std::sync::{Arc, Mutex}; // BLOCKING mutex -- only in spawn_blocking tasks! 1 2 use crate::db_types::{db_complete, DbBytes, DbStaticStr, EncodingError, StaticStr}; 2 3 use crate::store_types::{ 3 4 ByCollectionKey, ByCollectionValue, ByCursorSeenKey, ByCursorSeenValue, ByIdKey, ByIdValue, ··· 18 19 use tokio::{sync::mpsc::Receiver, time::sleep}; 19 20 20 21 /// Commit the RW batch immediately if this nubmer of events have been read off the mod queue 21 - const MAX_BATCHED_RW_EVENTS: usize = 18; 22 + const MAX_BATCHED_RW_EVENTS: usize = 3; 22 23 23 24 /// Commit the RW batch immediately if this number of records is reached 24 25 /// ··· 58 59 pub struct Storage { 59 60 keyspace: Keyspace, 60 61 partition: PartitionHandle, 62 + // write_lock: Arc<Mutex<()>>, 61 63 } 62 64 63 65 impl Storage { ··· 70 72 Ok(Self { 71 73 keyspace, 72 74 partition, 75 + // write_lock: Arc::new(Mutex::new(())), 73 76 }) 74 77 } 75 78 ··· 108 111 // TODO: see rw_loop: enforce single-thread. 109 112 loop { 110 113 let t_sleep = Instant::now(); 111 - sleep(Duration::from_secs_f64(0.3)).await; // TODO: minimize during replay 114 + sleep(Duration::from_secs_f64(0.8)).await; // TODO: minimize during replay 112 115 let slept_for = t_sleep.elapsed(); 113 116 let queue_size = receiver.len(); 114 117 115 118 if let Some(event_batch) = receiver.recv().await { 119 + log::trace!("write: received write batch"); 116 120 let batch_summary = summarize_batch(&event_batch); 117 121 118 122 let last = event_batch.last_jetstream_cursor.clone(); // TODO: get this from the data. track last in consumer. compute or track first. 119 123 120 124 let keyspace = self.keyspace.clone(); 121 125 let partition = self.partition.clone(); 126 + // let write_lock = self.write_lock.clone(); 122 127 123 128 let writer_t0 = Instant::now(); 129 + log::trace!("spawn_blocking for write batch"); 124 130 tokio::task::spawn_blocking(move || { 125 131 DBWriter { 126 132 keyspace, 127 133 partition, 134 + // write_lock, 128 135 } 129 136 .write_batch(event_batch, last) 130 137 }) 131 138 .await??; 139 + log::trace!("write: back from blocking task, successfully wrote batch"); 132 140 let wrote_for = writer_t0.elapsed(); 133 141 134 142 println!("{batch_summary}, slept {slept_for: <12?}, wrote {wrote_for: <11?}, queue: {queue_size}"); 135 143 } else { 144 + log::error!("store consumer: receive channel failed (dropped/closed?)"); 136 145 anyhow::bail!("receive channel closed"); 137 146 } 138 147 } ··· 145 154 sleep(Duration::from_secs_f64(0.001)).await; // todo: interval rate-limit instead 146 155 let keyspace = self.keyspace.clone(); 147 156 let partition = self.partition.clone(); 157 + log::trace!("rw: spawn blocking for batch..."); 148 158 tokio::task::spawn_blocking(move || -> anyhow::Result<()> { 159 + log::trace!("rw: getting rw cursor..."); 149 160 let mod_cursor = get_static::<ModCursorKey, ModCursorValue>(&partition)? 150 161 .unwrap_or(Cursor::from_start()); 151 162 let range = ModQueueItemKey::new(mod_cursor.clone()).range_to_prefix_end()?; 152 163 153 164 let mut db_batch = keyspace.batch(); 154 165 let mut batched_rw_items = 0; 166 + let mut any_tasks_found = false; 155 167 168 + log::trace!("rw: iterating newer rw items..."); 156 169 for (i, pair) in partition.range(range.clone()).enumerate() { 170 + log::trace!("rw: iterating {i}"); 171 + any_tasks_found = true; 172 + 157 173 if i >= MAX_BATCHED_RW_EVENTS { 158 174 break; 159 175 } ··· 170 186 let mod_value: ModQueueItemValue = 171 187 db_complete::<ModQueueItemStringValue>(&val_bytes)?.try_into()?; 172 188 189 + log::trace!("rw: iterating {i}: sending to batcher {mod_key:?} => {mod_value:?}"); 173 190 batched_rw_items += DBWriter { 174 191 keyspace: keyspace.clone(), 175 192 partition: partition.clone(), 176 193 } 177 194 .write_rw(&mut db_batch, mod_key, mod_value)?; 195 + log::trace!("rw: iterating {i}: back from batcher."); 178 196 179 197 if batched_rw_items >= MAX_BATCHED_RW_ITEMS { 198 + log::trace!("rw: iterating {i}: batch big enough, breaking out."); 180 199 break; 181 200 } 182 201 } 183 202 184 - db_batch.commit()?; 203 + if !any_tasks_found { 204 + log::trace!("rw: skipping batch commit since apparently no items were added (this is normal, skipping is new)"); 205 + return Ok(()); 206 + } 207 + 208 + log::info!("rw: committing rw batch with {batched_rw_items} items (items != total inserts/deletes)..."); 209 + let r = db_batch.commit(); 210 + log::info!("rw: commit result: {r:?}"); 211 + r?; 185 212 Ok(()) 186 213 }) 187 214 .await??; 215 + log::trace!("rw: back from blocking for rw..."); 188 216 } 217 + // log::warn!("exited rw loop (rw task)"); 189 218 } 190 219 191 220 pub async fn get_collection_records( ··· 352 381 if let Some(cursor) = last { 353 382 insert_batch_static::<JetstreamCursorKey>(&mut db_batch, &self.partition, cursor)?; 354 383 } 355 - Ok(db_batch.commit()?) 384 + log::info!("write: committing write batch..."); 385 + let r = db_batch.commit(); 386 + log::info!("write: commit result: {r:?}"); 387 + r?; 388 + Ok(()) 356 389 } 357 390 358 391 fn write_rw( ··· 367 400 368 401 let items_modified = match mod_value { 369 402 ModQueueItemValue::DeleteAccount(did) => { 403 + log::trace!("rw: batcher: delete account..."); 370 404 let (items, finished) = self.delete_account(db_batch, mod_cursor, did)?; 405 + log::trace!("rw: batcher: back from delete account (finished? {finished})"); 371 406 if finished { 372 407 // only remove the queued rw task if we have actually completed its account removal work 373 408 remove_batch::<ModQueueItemKey>(db_batch, &self.partition, mod_key)?; 409 + items + 1 410 + } else { 411 + items 374 412 } 375 - items 376 413 } 377 414 ModQueueItemValue::DeleteRecord(did, collection, rkey) => { 415 + log::trace!("rw: batcher: delete record..."); 416 + let items = self.delete_record(db_batch, mod_cursor, did, collection, rkey)?; 417 + log::trace!("rw: batcher: back from delete record"); 378 418 remove_batch::<ModQueueItemKey>(db_batch, &self.partition, mod_key)?; 379 - self.delete_record(db_batch, mod_cursor, did, collection, rkey)? 419 + items + 1 380 420 } 381 421 ModQueueItemValue::UpdateRecord(did, collection, rkey, record) => { 422 + let items = self.update_record(db_batch, mod_cursor, did, collection, rkey, record)?; 382 423 remove_batch::<ModQueueItemKey>(db_batch, &self.partition, mod_key)?; 383 - self.update_record(db_batch, mod_cursor, did, collection, rkey, record)? 424 + items + 1 384 425 } 385 426 }; 386 427 Ok(items_modified) ··· 423 464 ByIdKey::record_prefix(did, collection.clone(), rkey).to_db_bytes()?; 424 465 425 466 let mut items_removed = 0; 426 - for pair in self.partition.prefix(&key_prefix_bytes) { 467 + 468 + log::trace!("delete_record: iterate over prefix(!)..."); 469 + for (i, pair) in self.partition.prefix(&key_prefix_bytes).enumerate() { 470 + log::trace!("delete_record iter {i}: found"); 427 471 // find all (hopefully 1) 428 472 let (key_bytes, _) = pair?; 429 473 let key = db_complete::<ByIdKey>(&key_bytes)?; 430 474 let found_cursor = key.cursor(); 431 475 if found_cursor > cursor { 432 476 // we are *only* allowed to delete records that came before the record delete event 433 - eprintln!("delete_record: found (and ignoring) newer version(s). key: {key:?}"); 477 + log::trace!("delete_record: found (and ignoring) newer version(s). key: {key:?}"); 434 478 break; 435 479 } 436 480 ··· 445 489 items_removed += 1; 446 490 } 447 491 448 - if items_removed > 1 { 449 - eprintln!("odd, removed {items_removed} records for one record removal:"); 450 - for (i, pair) in self.partition.prefix(&key_prefix_bytes).enumerate() { 451 - // find all (hopefully 1) 452 - let (key_bytes, _) = pair?; 453 - let found_cursor = db_complete::<ByIdKey>(&key_bytes)?.cursor(); 454 - if found_cursor > cursor { 455 - break; 456 - } 492 + // if items_removed > 1 { 493 + // log::trace!("odd, removed {items_removed} records for one record removal:"); 494 + // for (i, pair) in self.partition.prefix(&key_prefix_bytes).enumerate() { 495 + // // find all (hopefully 1) 496 + // let (key_bytes, _) = pair?; 497 + // let found_cursor = db_complete::<ByIdKey>(&key_bytes)?.cursor(); 498 + // if found_cursor > cursor { 499 + // break; 500 + // } 457 501 458 - let key = db_complete::<ByIdKey>(&key_bytes)?; 459 - eprintln!(" {i}: key {key:?}"); 460 - } 461 - } 502 + // let key = db_complete::<ByIdKey>(&key_bytes)?; 503 + // log::trace!(" {i}: key {key:?}"); 504 + // } 505 + // } 462 506 Ok(items_removed) 463 507 } 464 508 ··· 476 520 477 521 let (_, collection, _rkey, found_cursor) = db_complete::<ByIdKey>(&key_bytes)?.into(); 478 522 if found_cursor > cursor { 479 - eprintln!( 523 + log::trace!( 480 524 "delete account: found (and ignoring) newer records than the delete event??" 481 525 ); 482 526 continue; ··· 495 539 return Ok((items_added, false)); // there might be more records but we've done enough for this batch 496 540 } 497 541 } 542 + 498 543 499 544 Ok((items_added, true)) 500 545 }