
Commit ad48ab8

Kixiron authored and ryzhyk committed

General code cleanup

Addressed some clippy lints, cleaned up some code, and added very basic input validation to the GDELT demo.

1 parent fafedff · commit ad48ab8

File tree

17 files changed (+184, −172 lines)


Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 2 additions & 3 deletions
```diff
@@ -70,7 +70,6 @@ cached = { version = "0.38.0", optional = true }
 features = ["hashbrown", "time-std", "xxhash-xxh3"]
 
 [dev-dependencies]
-csv = "1.1.6"
 zip = "0.6.2"
 tar = "0.4.38"
 rand = "0.8.5"
@@ -98,6 +97,7 @@ time = { version = "0.3.14", features = [
 itertools = "0.10.5"
 serde_json = "1.0.87"
 serde_with = "2.0.1"
+csv = { git = "https://github.com/ryzhyk/rust-csv.git" }
 
 [profile.bench]
 debug = true
@@ -131,7 +131,7 @@ required-features = ["with-csv"]
 harness = false
 
 [[bench]]
-name = "column_leaf"
+name = "column_layer"
 harness = false
 
 [[bench]]
@@ -142,6 +142,5 @@ required-features = ["__gdelt"]
 # Waiting for bincode 2.0.0 to be released (https://github.com/thomcc/arcstr/pull/45)
 [patch.crates-io.arcstr]
 git = "https://github.com/gz/arcstr.git"
-features = ["bincode"]
rev = "b43120c"
 optional = true
```
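The notable change here is `csv` moving from a published crates.io release in `[dev-dependencies]` to a git fork. For reference, a minimal sketch of the two dependency forms (the `branch` key below is hypothetical; without `rev`, `tag`, or `branch`, Cargo tracks the repository's default branch):

```toml
[dev-dependencies]
# Published release, resolved from crates.io:
# csv = "1.1.6"

# Built from a fork's repository instead; pin with `rev`, `tag`, or `branch`
# to keep builds reproducible (the branch name below is hypothetical).
csv = { git = "https://github.com/ryzhyk/rust-csv.git", branch = "master" }
```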

benches/column_leaf.rs renamed to benches/column_layer.rs

Lines changed: 2 additions & 2 deletions
```diff
@@ -86,7 +86,7 @@ macro_rules! leaf_benches {
         b.iter_batched(
             || unsorted.clone(),
             |unsorted| {
-                let mut builder = UnorderedColumnLeafBuilder::new();
+                let mut builder = UnorderedColumnLayerBuilder::new();
                 for tuple in unsorted {
                     builder.push_tuple(tuple);
                 }
@@ -138,7 +138,7 @@ macro_rules! leaf_benches {
         b.iter_batched(
             || (left.cursor(), right.cursor()),
             |(left, right)| {
-                let mut builder = OrderedColumnLeafBuilder::new();
+                let mut builder = ColumnLayerBuilder::new();
                 builder.push_merge(left, right);
             },
             BatchSize::PerIteration,
```
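Both hunks are pure renames (`ColumnLeaf` → `ColumnLayer`) inside criterion's `iter_batched` pattern, which keeps per-iteration setup such as cloning the input or opening cursors out of the timed closure. A self-contained sketch of that pattern, with a plain sort standing in for the DBSP layer builders:

```rust
use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion};

fn bench_sort(c: &mut Criterion) {
    let unsorted: Vec<(u64, i64)> = (0..1_000).rev().map(|i| (i, 1)).collect();

    c.bench_function("sort_unsorted", |b| {
        b.iter_batched(
            // Setup: runs outside the timed region, so the clone isn't measured.
            || unsorted.clone(),
            // Routine: only this closure is timed.
            |mut input| {
                input.sort_unstable();
                black_box(input)
            },
            // One fresh batch per iteration, as in the benches above.
            BatchSize::PerIteration,
        )
    });
}

criterion_group!(benches, bench_sort);
criterion_main!(benches);
```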

benches/galen.rs

Lines changed: 2 additions & 2 deletions
```diff
@@ -309,11 +309,11 @@ fn main() -> Result<()> {
     let mut csv_writer = csv::WriterBuilder::new().from_writer(file);
     if !results_file_already_exists {
         csv_writer
-            .write_record(&["name", "workers", "elapsed"])
+            .write_record(["name", "workers", "elapsed"])
             .expect("failed to write csv header");
     }
     csv_writer
-        .write_record(&[
+        .write_record([
             "galen",
             args.workers.to_string().as_str(),
             elapsed.as_secs_f64().to_string().as_str(),
```
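Dropping the `&` addresses clippy's `needless_borrow` lint: `write_record` accepts any `IntoIterator` whose items are `AsRef<[u8]>`, and arrays implement `IntoIterator` by value (since Rust 1.53), so the borrow added nothing. A minimal round-trip against the crates.io `csv` API:

```rust
fn main() {
    // Write into an in-memory buffer instead of a file.
    let mut writer = csv::WriterBuilder::new().from_writer(vec![]);

    // Arrays iterate by value, so no `&` is needed.
    writer
        .write_record(["name", "workers", "elapsed"])
        .expect("failed to write csv header");
    writer
        .write_record(["galen", "8", "1.25"])
        .expect("failed to write csv record");

    let bytes = writer.into_inner().expect("failed to flush csv writer");
    assert_eq!(bytes, b"name,workers,elapsed\ngalen,8,1.25\n");
}
```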

benches/gdelt/data.rs

Lines changed: 2 additions & 1 deletion
```diff
@@ -57,6 +57,7 @@
 //! [GKG cookbook]: http://data.gdeltproject.org/documentation/GDELT-Global_Knowledge_Graph_Codebook-V2.1.pdf
 
 use arcstr::{literal, ArcStr};
+use bincode::{Decode, Encode};
 use csv::{ReaderBuilder, Trim};
 use dbsp::CollectionHandle;
 use hashbrown::{HashMap, HashSet};
@@ -83,7 +84,7 @@ type Interner = HashSet<ArcStr, Xxh3Builder>;
 type Invalid = HashSet<&'static str, Xxh3Builder>;
 type Normalizations = HashMap<&'static str, &'static [ArcStr], Xxh3Builder>;
 
-#[derive(Debug, Clone, SizeOf)]
+#[derive(Debug, Clone, SizeOf, Decode, Encode)]
 pub struct PersonalNetworkGkgEntry {
     pub id: ArcStr,
     pub date: u64,
```
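`Encode` and `Decode` are bincode 2's derive macros, letting the struct round-trip through bincode's serialization functions; this is presumably also why Cargo.toml patches `arcstr`, so that `ArcStr` fields like `id` implement the same traits. A minimal sketch with a hypothetical struct mirroring the shape above (assumes bincode 2 with its `derive` feature):

```rust
use bincode::{Decode, Encode};

#[derive(Debug, Clone, PartialEq, Encode, Decode)]
struct Entry {
    id: String,
    date: u64,
}

fn main() {
    let entry = Entry { id: "gkg-1".into(), date: 20221104 };

    // Encode to a growable buffer, then decode it back.
    let bytes = bincode::encode_to_vec(&entry, bincode::config::standard()).unwrap();
    let (decoded, _len): (Entry, usize) =
        bincode::decode_from_slice(&bytes, bincode::config::standard()).unwrap();

    assert_eq!(entry, decoded);
}
```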

benches/gdelt/main.rs

Lines changed: 8 additions & 1 deletion
```diff
@@ -69,7 +69,14 @@ fn main() {
         .map(NonZeroUsize::get)
         .unwrap_or(1);
     let batches = args.batches.get();
-    let person = ArcStr::from(args.person);
+    let person = ArcStr::from(args.person.trim().to_lowercase());
+
+    if let Some((start, end)) = args.date_start.zip(args.date_end) {
+        if start > end {
+            eprintln!("error: `--date-start` must be less than or equal to `--date-end` ({start} > {end})");
+            return;
+        }
+    }
 
     let (mut handle, mut entries) = Runtime::init_circuit(threads, move |circuit| {
         let (events, handle) = circuit.add_input_zset();
```
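The added guard leans on `Option::zip`, which yields `Some((a, b))` only when both options are `Some`, so the range check fires only when the user supplied both `--date-start` and `--date-end`. A standalone sketch of the same logic (bound values hardcoded for illustration):

```rust
fn main() {
    // Stand-ins for parsed CLI arguments; either bound may be absent.
    let date_start: Option<u64> = Some(20230102);
    let date_end: Option<u64> = Some(20230101);

    // `zip` yields Some only when both bounds are present.
    if let Some((start, end)) = date_start.zip(date_end) {
        if start > end {
            eprintln!("error: `--date-start` must be less than or equal to `--date-end` ({start} > {end})");
            return;
        }
    }

    println!("date range ok (or unbounded)");
}
```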

benches/ldbc-graphalytics/data.rs

Lines changed: 42 additions & 46 deletions
```diff
@@ -6,8 +6,10 @@ use dbsp::{
     Circuit, OrdIndexedZSet, OrdZSet, Stream,
 };
 use indicatif::{HumanBytes, ProgressBar, ProgressState, ProgressStyle};
+use reqwest::header::CONTENT_LENGTH;
 use std::{
     cmp::Reverse,
+    collections::HashMap,
     ffi::OsStr,
     fmt::{self, Debug},
     fs::{self, File, OpenOptions},
@@ -115,8 +117,37 @@ pub(crate) fn list_downloaded_benchmarks() {
 }
 
 pub(crate) fn list_datasets() {
+    let cache_file = Path::new(DATA_PATH).join("dataset_cache.json");
+    let dataset_sizes = if cache_file.exists() {
+        serde_json::from_reader(File::open(&cache_file).unwrap()).unwrap_or_default()
+    } else {
+        let mut sizes = HashMap::with_capacity(DataSet::DATASETS.len());
+
+        // TODO: Realistically we should be doing all of these requests in parallel but
+        // I don't feel like adding tokio as a direct dependency at the moment (it's
+        // already a transitive dependency so it doesn't *really* matter, I'm just lazy)
+        let client = reqwest::blocking::Client::new();
+        for dataset in DataSet::DATASETS {
+            if let Ok(response) = client.head(dataset.url).send() {
+                if let Some(length) = response.headers()[CONTENT_LENGTH]
+                    .to_str()
+                    .ok()
+                    .and_then(|len| len.parse::<u64>().ok())
+                {
+                    sizes.insert(dataset.name.to_owned(), length);
+                }
+            }
+        }
+
+        fs::create_dir_all(DATA_PATH).unwrap();
+        let cache_file = BufWriter::new(File::create(&cache_file).unwrap());
+        serde_json::to_writer_pretty(cache_file, &sizes).unwrap();
+
+        sizes
+    };
+
     let mut datasets = DataSet::DATASETS.to_vec();
-    datasets.sort_by_key(|dataset| dataset.scale);
+    datasets.sort_by_key(|dataset| (dataset.scale, dataset_sizes.get(dataset.name).copied()));
 
     let longest_name = datasets
         .iter()
@@ -126,12 +157,18 @@ pub(crate) fn list_datasets() {
 
     let mut stdout = io::stdout().lock();
     for dataset in datasets {
-        writeln!(
+        write!(
             stdout,
-            "{:<longest_name$} scale: {:?}",
+            "{:<longest_name$} scale: {:?} archive size: ",
             dataset.name, dataset.scale,
         )
         .unwrap();
+
+        if let Some(&length) = dataset_sizes.get(dataset.name) {
+            writeln!(stdout, "{}", HumanBytes(length)).unwrap();
+        } else {
+            writeln!(stdout, "???").unwrap();
+        }
     }
 
     stdout.flush().unwrap();
@@ -340,47 +377,6 @@ impl DataSet {
 
         Ok(data_path)
     }
-
-    // Urls are hosted with faster download speeds here:
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/cit-Patents.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/com-friendster.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_5-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_6-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_7-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_8-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_9-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_0-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_1-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_2-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_3-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_4-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_5-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_6-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_7-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_8-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_9-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_0-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_1-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_2-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_3-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_4-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-sf10k-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-sf3k-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/dota-league.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/example-directed.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/example-undirected.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-22.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-23.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-24.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-25.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-26.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-27.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-28.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-29.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-30.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/kgs.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/twitter_mpi.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/wiki-Talk.tar.zst
 }
 
 macro_rules! datasets {
@@ -429,7 +425,7 @@ datasets! {
     DATAGEN_9_4 = "datagen-9_4-fb" @ XL,
 
     DATAGEN_SF3K = "datagen-sf3k-fb" @ XL,
-    // There's also datagen-sf10k-fb but it requires downloading 2 files
+    DATAGEN_SF10K = "datagen-sf10k-fb" @ XL,
 
     GRAPH_500_22 = "graph500-22" @ S,
     GRAPH_500_23 = "graph500-23" @ M,
@@ -439,7 +435,7 @@ datasets! {
     GRAPH_500_27 = "graph500-27" @ XL,
     GRAPH_500_28 = "graph500-28" @ XXL,
     GRAPH_500_29 = "graph500-29" @ XXL,
-    // There's also graph500-30 but it's massive and requires downloading 4 files
+    GRAPH_500_30 = "graph500-30" @ XXL,
 
     KGS = "kgs" @ XS,
     WIKI_TALK = "wiki-Talk" @ XXS,
```
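The new `list_datasets` sizes each archive with an HTTP `HEAD` request, which fetches only headers, and caches the results in `dataset_cache.json` so later runs skip the network. A trimmed sketch of the probing half (hypothetical URL, no caching); unlike the indexed `headers()[CONTENT_LENGTH]` access above, `get` avoids a panic when a server omits the header:

```rust
use reqwest::header::CONTENT_LENGTH;

fn main() {
    // Requires reqwest with the `blocking` feature enabled.
    let client = reqwest::blocking::Client::new();

    let url = "https://example.com/dataset.tar.zst"; // hypothetical archive
    if let Ok(response) = client.head(url).send() {
        // HEAD returns headers only, so sizing a huge archive is one tiny round-trip.
        let length = response
            .headers()
            .get(CONTENT_LENGTH)
            .and_then(|value| value.to_str().ok())
            .and_then(|len| len.parse::<u64>().ok());

        match length {
            Some(bytes) => println!("archive size: {bytes} bytes"),
            None => println!("archive size: ???"),
        }
    }
}
```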

benches/ldbc-graphalytics/main.rs

Lines changed: 6 additions & 4 deletions
```diff
@@ -259,7 +259,7 @@ fn main() {
 
     if !results_file_already_exists {
         // Write a header row if the file is newly created
-        csv_writer.write_record(&[
+        csv_writer.write_record([
             "name",
             "algorithm",
             "dataset",
@@ -283,9 +283,10 @@ fn main() {
             "allocstats_after_current_commit",
             "allocstats_after_peak_commit",
             "allocstats_after_page_faults"
-        ]).expect("failed to write csv header");
+        ])
+        .expect("failed to write csv header");
     }
-    csv_writer.write_record(&[
+    csv_writer.write_record([
         "ldbc",
         args.algorithm(),
         config.dataset.name,
@@ -309,7 +310,8 @@ fn main() {
         stats.current_commit.to_string().as_str(),
         stats.peak_commit.to_string().as_str(),
         stats.page_faults.to_string().as_str(),
-    ]).expect("failed to write csv record");
+    ])
+    .expect("failed to write csv record");
 }
 
 const MAX_PRINT_COUNT: usize = 10;
```

benches/ldbc-graphalytics/pagerank.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -155,7 +155,7 @@ pub fn pagerank(
     weights_var
         .stream()
         .apply2(&initial_weights, |weights, initial_weights| {
-            if initial_weights.is_empty() {
+            if !initial_weights.is_empty() {
                 initial_weights.clone()
             } else {
                 weights.clone()
```
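This one-character fix inverts a real bug: the old condition returned `initial_weights` exactly when it was empty and discarded it exactly when it held data. The intended pattern is "use the override when one exists, otherwise keep the computed value", sketched here with plain slices standing in for DBSP streams:

```rust
// Prefer the override when it is non-empty; otherwise keep the current value.
fn select<T: Clone>(current: &[T], initial: &[T]) -> Vec<T> {
    if !initial.is_empty() {
        initial.to_vec()
    } else {
        current.to_vec()
    }
}

fn main() {
    let weights = vec![0.25, 0.75];
    let overrides: Vec<f64> = vec![];

    // With no override supplied we must keep the computed weights;
    // the pre-fix logic would have returned the empty vector here.
    assert_eq!(select(&weights, &overrides), weights);
}
```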

benches/nexmark/main.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -305,7 +305,7 @@ fn main() -> Result<()> {
         .has_headers(false)
         .from_writer(file);
     if !results_file_already_exists {
-        csv_writer.write_record(&[
+        csv_writer.write_record([
             "name",
             "num_cores",
             "num_events",
```

src/algebra/floats.rs

Lines changed: 12 additions & 12 deletions
```diff
@@ -479,12 +479,12 @@ mod tests {
         ]
         .into_iter()
         {
-            let _length =
-                bincode::encode_into_slice(&input, &mut slice, bincode::config::standard())
-                    .unwrap();
-            let decoded: F64 = bincode::decode_from_slice(&slice, bincode::config::standard())
-                .unwrap()
-                .0;
+            let length =
+                bincode::encode_into_slice(input, &mut slice, bincode::config::standard()).unwrap();
+            let decoded: F64 =
+                bincode::decode_from_slice(&slice[..length], bincode::config::standard())
+                    .unwrap()
+                    .0;
             assert_eq!(decoded, input);
         }
     }
@@ -503,12 +503,12 @@ mod tests {
         ]
         .into_iter()
         {
-            let _length =
-                bincode::encode_into_slice(&input, &mut slice, bincode::config::standard())
-                    .unwrap();
-            let decoded: F32 = bincode::decode_from_slice(&slice, bincode::config::standard())
-                .unwrap()
-                .0;
+            let length =
+                bincode::encode_into_slice(input, &mut slice, bincode::config::standard()).unwrap();
+            let decoded: F32 =
+                bincode::decode_from_slice(&slice[..length], bincode::config::standard())
+                    .unwrap()
+                    .0;
             assert_eq!(decoded, input);
         }
     }
```
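The substantive change in both tests: `encode_into_slice` returns how many bytes it wrote into the caller's buffer, and the decode now reads `&slice[..length]` rather than the whole buffer, so the round-trip only ever sees bytes the encoder actually produced (a reused buffer's tail can hold stale data from an earlier, longer encoding). A standalone sketch with plain `f64` and a hypothetical buffer size:

```rust
fn main() {
    // Reused fixed-size buffer; only a prefix is valid after each encode.
    let mut slice = [0u8; 32];

    for input in [0.0f64, -0.0, 1.5, f64::NAN, f64::INFINITY] {
        let length =
            bincode::encode_into_slice(input, &mut slice, bincode::config::standard()).unwrap();

        // Decode only the bytes that were just written.
        let (decoded, _): (f64, usize) =
            bincode::decode_from_slice(&slice[..length], bincode::config::standard()).unwrap();

        // NaN != NaN for plain floats, so compare bit patterns instead.
        assert_eq!(decoded.to_bits(), input.to_bits());
    }
}
```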

src/nexmark/generator/bids.rs

Lines changed: 11 additions & 8 deletions
```diff
@@ -118,7 +118,7 @@ impl<R: Rng> NexmarkGenerator<R> {
 fn get_base_url<R: Rng>(rng: &mut R) -> ArcStr {
     arcstr::format!(
         "https://www.nexmark.com/{}/item.htm?query=1",
-        next_string(rng, BASE_URL_PATH_LENGTH)
+        next_string(rng, BASE_URL_PATH_LENGTH),
     )
 }
 
@@ -153,12 +153,10 @@ pub mod tests {
                 auction: expected_auction_id,
                 bidder: expected_bidder_id,
                 price: 100,
-                channel: "Google".to_string().into(),
-                url: "https://www.nexmark.com/googl/item.htm?query=1"
-                    .to_string()
-                    .into(),
+                channel: arcstr::literal!("Google"),
+                url: arcstr::literal!("https://www.nexmark.com/googl/item.htm?query=1"),
                 date_time: 1_000_000_000_000,
-                extra: (0..expected_size).map(|_| "A").collect::<String>().into(),
+                extra: "A".repeat(expected_size).into(),
             },
             bid
         );
@@ -189,8 +187,13 @@ pub mod tests {
     #[test]
     fn test_get_new_channel_instance_cached() {
         let mut ng = make_test_generator();
-        ng.bid_channel_cache
-            .cache_set(1234, ("Google".into(), "https://google.example.com".into()));
+        ng.bid_channel_cache.cache_set(
+            1234,
+            (
+                arcstr::literal!("Google"),
+                arcstr::literal!("https://google.example.com"),
+            ),
+        );
 
         let (channel_name, channel_url) = ng.get_new_channel_instance(1234);
```