From 0111079153c16fb98c7da417f298143747cd9e2c Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 28 Sep 2021 16:57:13 +0100 Subject: [PATCH] Make the auto compressor uploadable to pypi (#75) --- Cargo.lock | 42 +++++----- Cargo.toml | 2 +- README.md | 81 ++++++++++--------- compressor_integration_tests/Cargo.toml | 4 +- compressor_integration_tests/src/lib.rs | 8 +- .../tests/auto_compressor_manager_tests.rs | 8 +- .../auto_compressor_state_saving_tests.rs | 6 +- docs/python.md | 8 +- src/lib.rs | 2 +- .../Cargo.toml | 12 ++- synapse_auto_compressor/README.md | 12 +++ .../src/lib.rs | 15 ++-- .../src/main.rs | 16 ++-- .../src/manager.rs | 0 .../src/state_saving.rs | 0 15 files changed, 118 insertions(+), 98 deletions(-) rename {auto_compressor => synapse_auto_compressor}/Cargo.toml (65%) create mode 100644 synapse_auto_compressor/README.md rename {auto_compressor => synapse_auto_compressor}/src/lib.rs (89%) rename {auto_compressor => synapse_auto_compressor}/src/main.rs (93%) rename {auto_compressor => synapse_auto_compressor}/src/manager.rs (100%) rename {auto_compressor => synapse_auto_compressor}/src/state_saving.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 0f55e8d..091fbd5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -54,26 +54,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "auto_compressor" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap", - "env_logger", - "jemallocator", - "log", - "log-panics", - "openssl", - "postgres", - "postgres-openssl", - "pyo3", - "pyo3-log", - "rand", - "serial_test", - "synapse_compress_state", -] - [[package]] name = "autocfg" version = "1.0.1" @@ -144,7 +124,6 @@ dependencies = [ name = "compressor_integration_tests" version = "0.1.0" dependencies = [ - "auto_compressor", "env_logger", "log", "openssl", @@ -154,6 +133,7 @@ dependencies = [ "serial_test", "state-map", "string_cache", + "synapse_auto_compressor", "synapse_compress_state", ] @@ -1122,6 +1102,26 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "synapse_auto_compressor" +version = "0.1.1" +dependencies = [ + "anyhow", + "clap", + "env_logger", + "jemallocator", + "log", + "log-panics", + "openssl", + "postgres", + "postgres-openssl", + "pyo3", + "pyo3-log", + "rand", + "serial_test", + "synapse_compress_state", +] + [[package]] name = "synapse_compress_state" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index bfa9ec7..9c867c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["auto_compressor", "compressor_integration_tests"] +members = ["synapse_auto_compressor", "compressor_integration_tests"] [package] authors = ["Erik Johnston"] diff --git a/README.md b/README.md index 60995dd..2fdebf2 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ This workspace contains experimental tools that attempt to reduce the number of rows in the `state_groups_state` table inside of a Synapse Postgresql database. -# Automated tool: auto_compressor +# Automated tool: synapse_auto_compressor ## Introduction: @@ -11,7 +11,7 @@ This tool is significantly more simple to use than the manual tool (described be It scans through all of the rows in the `state_groups` database table from the start. When it finds a group that hasn't been compressed, it runs the compressor for a while on that group's room, saving where it got up to. After compressing a number of these chunks it stops, -saving where it got up to for the next run of the `auto_compressor`. +saving where it got up to for the next run of the `synapse_auto_compressor`. It creates three extra tables in the database: `state_compressor_state` which stores the information needed to stop and start the compressor for each room, `state_compressor_progress` @@ -21,41 +21,42 @@ which stores how far through the `state_groups` table the compressor has scanned The tool can be run manually when you are running out of space, or be scheduled to run periodically. -## Building +## Building This tool requires `cargo` to be installed. See https://www.rust-lang.org/tools/install for instructions on how to do this. -To build `auto_compressor`, clone this repository and navigate to the `autocompressor/` -subdirectory. Then execute `cargo build`. +To build `synapse_auto_compressor`, clone this repository and navigate to the +`synapse_auto_compressor/` subdirectory. Then execute `cargo build`. -This will create an executable and store it in `auto_compressor/target/debug/auto_compressor`. +This will create an executable and store it in +`synapse_auto_compressor/target/debug/synapse_auto_compressor`. ## Example usage ``` -$ auto_compressor -p postgresql://user:pass@localhost/synapse -c 500 -n 100 +$ synapse_auto_compressor -p postgresql://user:pass@localhost/synapse -c 500 -n 100 ``` ## Running Options -- -p [POSTGRES_LOCATION] **Required** +- -p [POSTGRES_LOCATION] **Required** The configuration for connecting to the Postgres database. This should be of the form `"postgresql://username:password@mydomain.com/database"` or a key-value pair string: `"user=username password=password dbname=database host=mydomain.com"` See https://docs.rs/tokio-postgres/0.7.2/tokio_postgres/config/struct.Config.html for the full details. -- -c [CHUNK_SIZE] **Required** +- -c [CHUNK_SIZE] **Required** The number of state groups to work on at once. All of the entries from state_groups_state are requested from the database for state groups that are worked on. Therefore small chunk sizes may be needed on machines with low memory. Note: if the compressor fails to find space savings on the chunk as a whole (which may well happen in rooms with lots of backfill in) then the entire chunk is skipped. -- -n [CHUNKS_TO_COMPRESS] **Required** +- -n [CHUNKS_TO_COMPRESS] **Required** *CHUNKS_TO_COMPRESS* chunks of size *CHUNK_SIZE* will be compressed. The higher this number is set to, the longer the compressor will run for. -- -d [LEVELS] +- -d [LEVELS] Sizes of each new level in the compression algorithm, as a comma-separated list. The first entry in the list is for the lowest, most granular level, with each subsequent entry being for the next highest level. The number of entries in the @@ -67,14 +68,14 @@ given set of state. [defaults to "100,50,25"] ## Scheduling the compressor The automatic tool may put some strain on the database, so it might be best to schedule it to run at a quiet time for the server. This could be done by creating an executable -script and scheduling it with something like +script and scheduling it with something like [cron](https://www.man7.org/linux/man-pages/man1/crontab.1.html). # Manual tool: synapse_compress_state ## Introduction -A manual tool that reads in the rows from `state_groups_state` and `state_group_edges` +A manual tool that reads in the rows from `state_groups_state` and `state_group_edges` tables for a specified room and calculates the changes that could be made that (hopefully) will significantly reduce the number of rows. @@ -85,7 +86,7 @@ that if `-t` is given then each change to a particular state group is wrapped in a transaction). If you do wish to send the changes to the database automatically then the `-c` flag can be set. -The SQL generated is safe to apply against the database with Synapse running. +The SQL generated is safe to apply against the database with Synapse running. This is because the `state_groups` and `state_groups_state` tables are append-only: once written to the database, they are never modified. There is therefore no danger of a modification racing against a running Synapse. Further, this script makes its @@ -95,7 +96,7 @@ from any of the queries that Synapse performs. The tool will also ensure that the generated state deltas do give the same state as the existing state deltas before generating any SQL. -## Building +## Building This tool requires `cargo` to be installed. See https://www.rust-lang.org/tools/install for instructions on how to do this. @@ -125,54 +126,54 @@ $ psql synapse < out.data ## Running Options -- -p [POSTGRES_LOCATION] **Required** +- -p [POSTGRES_LOCATION] **Required** The configuration for connecting to the Postgres database. This should be of the form `"postgresql://username:password@mydomain.com/database"` or a key-value pair string: `"user=username password=password dbname=database host=mydomain.com"` See https://docs.rs/tokio-postgres/0.7.2/tokio_postgres/config/struct.Config.html for the full details. -- -r [ROOM_ID] **Required** +- -r [ROOM_ID] **Required** The room to process (this is the value found in the `rooms` table of the database not the common name for the room - it should look like: "!wOlkWNmgkAZFxbTaqj:matrix.org". -- -b [MIN_STATE_GROUP] +- -b [MIN_STATE_GROUP] The state group to start processing from (non-inclusive). -- -n [GROUPS_TO_COMPRESS] +- -n [GROUPS_TO_COMPRESS] How many groups to load into memory to compress (starting from the 1st group in the room or the group specified by -b). -- -l [LEVELS] +- -l [LEVELS] Sizes of each new level in the compression algorithm, as a comma-separated list. -The first entry in the list is for the lowest, most granular level, with each +The first entry in the list is for the lowest, most granular level, with each subsequent entry being for the next highest level. The number of entries in the list determines the number of levels that will be used. The sum of the sizes of the levels affects the performance of fetching the state from the database, as the sum of the sizes is the upper bound on the number of iterations needed to fetch a given set of state. [defaults to "100,50,25"] -- -m [COUNT] +- -m [COUNT] If the compressor cannot save this many rows from the database then it will stop early. -- -s [MAX_STATE_GROUP] +- -s [MAX_STATE_GROUP] If a max_state_group is specified then only state groups with id's lower than this number can be compressed. -- -o [FILE] +- -o [FILE] File to output the SQL transactions to (for later running on the database). -- -t +- -t If this flag is set then each change to a particular state group is wrapped in a transaction. This should be done if you wish to apply the changes while synapse is still running. -- -c +- -c If this flag is set then the changes the compressor makes will be committed to the database. This should be safe to use while synapse is running as it wraps the changes to every state group in it's own transaction (as if the transaction flag was set). -- -g +- -g If this flag is set then output the node and edge information for the state_group directed graph built up from the predecessor state_group links. These can be looked at in something like Gephi (https://gephi.org). @@ -196,10 +197,10 @@ $ docker-compose down # Using the synapse_compress_state library If you want to use the compressor in another project, it is recomended that you -use jemalloc `https://github.com/gnzlbg/jemallocator`. +use jemalloc `https://github.com/gnzlbg/jemallocator`. To prevent the progress bars from being shown, use the `no-progress-bars` feature. -(See `auto_compressor/Cargo.toml` for an example) +(See `synapse_auto_compressor/Cargo.toml` for an example) # Troubleshooting @@ -216,29 +217,29 @@ from the machine where Postgres is running, the url will be the following: ### From remote machine If you wish to connect from a different machine, you'll need to edit your Postgres settings to allow -remote connections. This requires updating the +remote connections. This requires updating the [`pg_hba.conf`](https://www.postgresql.org/docs/current/auth-pg-hba-conf.html) and the `listen_addresses` setting in [`postgresql.conf`](https://www.postgresql.org/docs/current/runtime-config-connection.html) ## Printing debugging logs -The amount of output the tools produce can be altered by setting the RUST_LOG -environment variable to something. +The amount of output the tools produce can be altered by setting the RUST_LOG +environment variable to something. -To get more logs when running the auto_compressor tool try the following: +To get more logs when running the synapse_auto_compressor tool try the following: ``` -$ RUST_LOG=debug auto_compressor -p postgresql://user:pass@localhost/synapse -c 50 -n 100 +$ RUST_LOG=debug synapse_auto_compressor -p postgresql://user:pass@localhost/synapse -c 50 -n 100 ``` -If you want to suppress all the debugging info you are getting from the +If you want to suppress all the debugging info you are getting from the Postgres client then try: ``` -RUST_LOG=auto_compressor=debug,synapse_compress_state=debug auto_compressor [etc.] +RUST_LOG=synapse_auto_compressor=debug,synapse_compress_state=debug synapse_auto_compressor [etc.] ``` -This will only print the debugging information from those two packages. For more info see +This will only print the debugging information from those two packages. For more info see https://docs.rs/env_logger/0.9.0/env_logger/. ## Building difficulties @@ -248,7 +249,7 @@ and building on Linux will also require `pkg-config` This can be done on Ubuntu with: `$ apt-get install libssl-dev pkg-config` -Note that building requires quite a lot of memory and out-of-memory errors might not be +Note that building requires quite a lot of memory and out-of-memory errors might not be obvious. It's recomended you only build these tools on machines with at least 2GB of RAM. ## Auto Compressor skips chunks when running on already compressed room @@ -265,8 +266,8 @@ be a large problem. ## Compressor is trying to increase the number of rows -Backfilling can lead to issues with compression. The auto_compressor will -skip chunks it can't reduce the size of and so this should help jump over the backfilled +Backfilling can lead to issues with compression. The synapse_auto_compressor will +skip chunks it can't reduce the size of and so this should help jump over the backfilled state_groups. Lots of state resolution might also impact the ability to use the compressor. To examine the state_group hierarchy run the manual tool on a room with the `-g` option diff --git a/compressor_integration_tests/Cargo.toml b/compressor_integration_tests/Cargo.toml index a03fe2a..e3142a0 100644 --- a/compressor_integration_tests/Cargo.toml +++ b/compressor_integration_tests/Cargo.toml @@ -13,9 +13,9 @@ postgres = "0.19.0" postgres-openssl = "0.5.0" rand = "0.8.0" synapse_compress_state = { path = "../", features = ["no-progress-bars"] } -auto_compressor = { path = "../auto_compressor/" } +synapse_auto_compressor = { path = "../synapse_auto_compressor/" } env_logger = "0.9.0" log = "0.4.14" [dependencies.state-map] -git = "https://github.com/matrix-org/rust-matrix-state-map" \ No newline at end of file +git = "https://github.com/matrix-org/rust-matrix-state-map" diff --git a/compressor_integration_tests/src/lib.rs b/compressor_integration_tests/src/lib.rs index a02bb39..cea0d5e 100644 --- a/compressor_integration_tests/src/lib.rs +++ b/compressor_integration_tests/src/lib.rs @@ -179,7 +179,7 @@ fn collapse_state_with_database(state_group: i64) -> StateMap { // the predecessor (so have split this into a different query) let query_pred = r#" SELECT prev_state_group - FROM state_group_edges + FROM state_group_edges WHERE state_group = $1 "#; @@ -243,7 +243,7 @@ pub fn database_structure_matches_map(state_group_map: &BTreeMap PyResult<()> { let _ = pyo3_log::Logger::default() // don't send out anything lower than a warning from other crates .filter(LevelFilter::Warn) - // don't log warnings from synapse_compress_state, the auto_compressor handles these + // don't log warnings from synapse_compress_state, the synapse_auto_compressor handles these // situations and provides better log messages .filter_target("synapse_compress_state".to_owned(), LevelFilter::Debug) .install(); diff --git a/auto_compressor/Cargo.toml b/synapse_auto_compressor/Cargo.toml similarity index 65% rename from auto_compressor/Cargo.toml rename to synapse_auto_compressor/Cargo.toml index c250183..a38f17a 100644 --- a/auto_compressor/Cargo.toml +++ b/synapse_auto_compressor/Cargo.toml @@ -1,10 +1,16 @@ [package] -name = "auto_compressor" +name = "synapse_auto_compressor" authors = ["William Ashton"] -version = "0.1.0" +version = "0.1.1" edition = "2018" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[package.metadata.maturin] +requires-python = ">=3.6" +project-url = {Source = "https://github.com/matrix-org/rust-synapse-compress-state"} +classifier = [ + "Development Status :: 4 - Beta", + "Programming Language :: Rust", +] [dependencies] clap = "2.33.0" diff --git a/synapse_auto_compressor/README.md b/synapse_auto_compressor/README.md new file mode 100644 index 0000000..41f2c57 --- /dev/null +++ b/synapse_auto_compressor/README.md @@ -0,0 +1,12 @@ +# Auto Compressor + +See the top level readme for information. + + +## Publishing to PyPI + +Bump the version number and run from the root directory of the repo: + +``` +docker run -it --rm -v $(pwd):/io -e OPENSSL_STATIC=1 konstin2/maturin publish -m synapse_auto_compressor/Cargo.toml --cargo-extra-args "\--features='openssl/vendored'" +``` diff --git a/auto_compressor/src/lib.rs b/synapse_auto_compressor/src/lib.rs similarity index 89% rename from auto_compressor/src/lib.rs rename to synapse_auto_compressor/src/lib.rs index cfa3451..aa4568f 100644 --- a/auto_compressor/src/lib.rs +++ b/synapse_auto_compressor/src/lib.rs @@ -57,15 +57,16 @@ impl FromStr for LevelInfo { // PyO3 INTERFACE STARTS HERE #[pymodule] -fn auto_compressor(_py: Python, m: &PyModule) -> PyResult<()> { +fn synapse_auto_compressor(_py: Python, m: &PyModule) -> PyResult<()> { let _ = pyo3_log::Logger::default() // don't send out anything lower than a warning from other crates .filter(LevelFilter::Warn) - // don't log warnings from synapse_compress_state, the auto_compressor handles these - // situations and provides better log messages + // don't log warnings from synapse_compress_state, the + // synapse_auto_compressor handles these situations and provides better + // log messages .filter_target("synapse_compress_state".to_owned(), LevelFilter::Error) - // log info and above for the auto_compressor - .filter_target("auto_compressor".to_owned(), LevelFilter::Debug) + // log info and above for the synapse_auto_compressor + .filter_target("synapse_auto_compressor".to_owned(), LevelFilter::Debug) .install(); // ensure any panics produce error messages in the log log_panics::init(); @@ -92,7 +93,7 @@ fn auto_compressor(_py: Python, m: &PyModule) -> PyResult<()> { number_of_chunks: i64, ) -> PyResult<()> { // Announce the start of the program to the logs - log::info!("auto_compressor started"); + log::info!("synapse_auto_compressor started"); // Parse the default_level string into a LevelInfo struct let default_levels: LevelInfo = match default_levels.parse() { @@ -120,7 +121,7 @@ fn auto_compressor(_py: Python, m: &PyModule) -> PyResult<()> { return Err(PyErr::new::(format!("{:?}", e))); } - log::info!("auto_compressor finished"); + log::info!("synapse_auto_compressor finished"); Ok(()) } Ok(()) diff --git a/auto_compressor/src/main.rs b/synapse_auto_compressor/src/main.rs similarity index 93% rename from auto_compressor/src/main.rs rename to synapse_auto_compressor/src/main.rs index 4cb06e2..215f0ec 100644 --- a/auto_compressor/src/main.rs +++ b/synapse_auto_compressor/src/main.rs @@ -19,20 +19,20 @@ #[global_allocator] static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; -use auto_compressor::{manager, state_saving, LevelInfo}; use clap::{crate_authors, crate_description, crate_name, crate_version, value_t, App, Arg}; use log::LevelFilter; use std::{env, fs::OpenOptions}; +use synapse_auto_compressor::{manager, state_saving, LevelInfo}; /// Execution starts here fn main() { - // setup the logger for the auto_compressor + // setup the logger for the synapse_auto_compressor // The default can be overwritten with RUST_LOG // see the README for more information let log_file = OpenOptions::new() .append(true) .create(true) - .open("auto_compressor.log") + .open("synapse_auto_compressor.log") .unwrap_or_else(|e| panic!("Error occured while opening the log file: {}", e)); if env::var("RUST_LOG").is_err() { @@ -41,8 +41,8 @@ fn main() { log_builder.filter_module("panic", LevelFilter::Error); // Only output errors from the synapse_compress state library log_builder.filter_module("synapse_compress_state", LevelFilter::Error); - // Output log levels info and above from auto_compressor - log_builder.filter_module("auto_compressor", LevelFilter::Info); + // Output log levels info and above from synapse_auto_compressor + log_builder.filter_module("synapse_auto_compressor", LevelFilter::Info); log_builder.init(); } else { // If RUST_LOG was set then use that @@ -54,7 +54,7 @@ fn main() { } log_panics::init(); // Announce the start of the program to the logs - log::info!("auto_compressor started"); + log::info!("synapse_auto_compressor started"); // parse the command line arguments using the clap crate let arguments = App::new(crate_name!()) @@ -113,7 +113,7 @@ fn main() { Arg::with_name("number_of_chunks") .short("n") .value_name("CHUNKS_TO_COMPRESS") - .help("The number of chunks to compress") + .help("The number of chunks to compress") .long_help(concat!( "This many chunks of the database will be compressed. The higher this number is set to, ", "the longer the compressor will run for." @@ -155,5 +155,5 @@ fn main() { manager::compress_chunks_of_database(db_url, chunk_size, &default_levels.0, number_of_chunks) .unwrap(); - log::info!("auto_compressor finished"); + log::info!("synapse_auto_compressor finished"); } diff --git a/auto_compressor/src/manager.rs b/synapse_auto_compressor/src/manager.rs similarity index 100% rename from auto_compressor/src/manager.rs rename to synapse_auto_compressor/src/manager.rs diff --git a/auto_compressor/src/state_saving.rs b/synapse_auto_compressor/src/state_saving.rs similarity index 100% rename from auto_compressor/src/state_saving.rs rename to synapse_auto_compressor/src/state_saving.rs