From 096467853da3df39bc88cbc6676a6aed8cd54fa7 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 18 Sep 2018 15:40:58 +0100 Subject: [PATCH] Improve algorithm and documentation --- Cargo.lock | 278 +++++++++++++++++++++++++++++-- Cargo.toml | 4 +- README.md | 41 +++-- src/compressor.rs | 255 +++++++++++++++++++++++++++++ src/database.rs | 125 ++++++++++++++ src/main.rs | 408 ++++++++++++++++------------------------------ 6 files changed, 821 insertions(+), 290 deletions(-) create mode 100644 src/compressor.rs create mode 100644 src/database.rs diff --git a/Cargo.lock b/Cargo.lock index fb34550..74c3864 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,3 +1,11 @@ +[[package]] +name = "aho-corasick" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 2.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "ansi_term" version = "0.11.0" @@ -135,6 +143,41 @@ dependencies = [ "vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "clicolors-control" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cloudabi" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "console" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "clicolors-control 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", + "parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", + "termios 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "constant_time_eq" version = "0.1.3" @@ -259,6 +302,17 @@ dependencies = [ "digest 0.7.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "indicatif" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "console 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "iovec" version = "0.1.2" @@ -273,6 +327,20 @@ name = "itoa" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "lazy_static" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "lazy_static" version = "1.1.0" @@ -295,6 +363,15 @@ dependencies = [ "pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "lock_api" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "owning_ref 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "log" version = "0.4.5" @@ -321,6 +398,14 @@ dependencies = [ "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "memchr" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "memoffset" version = "0.2.1" @@ -352,6 +437,35 @@ dependencies = [ "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "owning_ref" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "parking_lot" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lock_api 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "parking_lot_core" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "smallvec 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "phf" version = "0.7.23" @@ -451,6 +565,23 @@ dependencies = [ "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "rand" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_core" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "rayon" version = "1.0.2" @@ -485,6 +616,46 @@ dependencies = [ "redox_syscall 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "regex" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.6.8 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.5.6 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.6.8 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ucd-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ucd-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "rust-matrix-lib" version = "0.1.0" @@ -502,22 +673,19 @@ dependencies = [ "sodiumoxide 0.0.16 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "rust-synapse-compress-state" -version = "0.1.0" -dependencies = [ - "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", - "fallible-iterator 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", - "postgres 0.15.2 (registry+https://github.com/rust-lang/crates.io-index)", - "rayon 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", - "rust-matrix-lib 0.1.0 (git+https://github.com/erikjohnston/rust-matrix-lib)", -] - [[package]] name = "rustc-demangle" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "ryu" version = "0.2.6" @@ -533,6 +701,19 @@ name = "scopeguard" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "serde" version = "1.0.78" @@ -608,6 +789,11 @@ dependencies = [ "serde 1.0.78 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "stable_deref_trait" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "stringprep" version = "0.1.2" @@ -642,6 +828,18 @@ dependencies = [ "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "synapse-compress-state" +version = "0.1.0" +dependencies = [ + "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", + "fallible-iterator 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "indicatif 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", + "postgres 0.15.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rayon 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rust-matrix-lib 0.1.0 (git+https://github.com/erikjohnston/rust-matrix-lib)", +] + [[package]] name = "synstructure" version = "0.9.0" @@ -663,6 +861,14 @@ dependencies = [ "redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "termios" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "textwrap" version = "0.10.0" @@ -671,6 +877,14 @@ dependencies = [ "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "thread_local" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "time" version = "0.1.40" @@ -686,6 +900,11 @@ name = "typenum" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "ucd-util" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "unicode-bidi" version = "0.3.4" @@ -717,6 +936,11 @@ dependencies = [ "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "utf8-ranges" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "vec_map" version = "0.8.1" @@ -746,6 +970,11 @@ dependencies = [ "winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" @@ -757,6 +986,7 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" [metadata] +"checksum aho-corasick 0.6.8 (registry+https://github.com/rust-lang/crates.io-index)" = "68f56c7353e5a9547cbd76ed90f7bb5ffc3ba09d4ea9bd1d8c06c8b1142eeb5a" "checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" "checksum arrayref 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0d382e583f07208808f6b1249e60848879ba3543f57c32277bf52d69c2f0f0ee" "checksum arrayvec 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)" = "a1e964f9e24d588183fcb43503abda40d288c8657dfc27311516ce2f05675aef" @@ -774,6 +1004,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum cfg-if 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0c4e7bb64a8ebb0d856483e1e682ea3422f883c5f5615a90d51a2c82fe87fdd3" "checksum chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "45912881121cb26fad7c38c17ba7daa18764771836b34fab7d3fbd93ed633878" "checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" +"checksum clicolors-control 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1f84dec9bc083ce2503908cd305af98bd363da6f54bf8d4bf0ac14ee749ad5d1" +"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +"checksum console 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "ecd48adf136733979b49e15bc3b4c43cc0d3c85ece7bd08e6daa414c6fcb13e6" "checksum constant_time_eq 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8ff012e225ce166d4422e0e78419d901719760f62ae2b7969ca6b564d1b54a9e" "checksum crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f739f8c5363aca78cfb059edf753d8f0d36908c348f3d8d1503f03d8b75d9cf3" "checksum crossbeam-epoch 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "927121f5407de9956180ff5e936fe3cf4324279280001cd56b669d28ee7e9150" @@ -790,20 +1023,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum generic-array 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ef25c5683767570c2bbd7deba372926a55eaae9982d7726ee2a1050239d45b9d" "checksum hex 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d6a22814455d41612f41161581c2883c0c6a1c41852729b17d5ed88f01e153aa" "checksum hmac 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "44f3bdb08579d99d7dc761c0e266f13b5f2ab8c8c703b9fc9ef333cd8f48f55e" +"checksum indicatif 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a29b2fa6f00010c268bface64c18bb0310aaa70d46a195d5382d288c477fb016" "checksum iovec 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "dbe6e417e7d0975db6512b90796e8ce223145ac4e33c377e4a42882a0e88bb08" "checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" +"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +"checksum lazy_static 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "76f033c7ad61445c5b347c7382dd1237847eb1bce590fe50365dcb33d546be73" "checksum lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ca488b89a5657b0a2ecd45b95609b3e848cf1755da332a0da46e2b2b1cb371a7" "checksum libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)" = "76e3a3ef172f1a0b9a9ff0dd1491ae5e6c948b94479a3021819ba7d860c8645d" "checksum libsodium-sys 0.0.16 (registry+https://github.com/rust-lang/crates.io-index)" = "fcbd1beeed8d44caa8a669ebaa697c313976e242c03cc9fb23d88bf1656f5542" +"checksum lock_api 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "949826a5ccf18c1b3a7c3d57692778d21768b79e46eb9dd07bfc4c2160036c54" "checksum log 0.4.5 (registry+https://github.com/rust-lang/crates.io-index)" = "d4fcce5fa49cc693c312001daf1d13411c4a5283796bac1084299ea3e567113f" "checksum matches 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" "checksum md5 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "79c56d6a0b07f9e19282511c83fc5b086364cbae4ba8c7d5f190c3d9b0425a48" "checksum memchr 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a" +"checksum memchr 2.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a3b4142ab8738a78c51896f704f83c11df047ff1bda9a92a661aa6361552d93d" "checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3" "checksum nodrop 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)" = "9a2228dca57108069a5262f2ed8bd2e82496d2e074a06d1ccc7ce1687b6ae0a2" "checksum num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "e83d528d2677f0518c570baf2b7abdcf0cd2d248860b68507bdcb3e91d4c0cea" "checksum num-traits 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "630de1ef5cc79d0cdd78b7e33b81f083cbfe90de0f4b2b2f07f905867c70e9fe" "checksum num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c51a3322e4bca9d212ad9a158a02abc6934d005490c054a2778df73a70aa0a30" +"checksum owning_ref 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "cdf84f41639e037b484f93433aa3897863b561ed65c6e59c7073d7c561710f37" +"checksum parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "f0802bff09003b291ba756dc7e79313e51cc31667e94afbe847def490424cde5" +"checksum parking_lot_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ad7f7e6ebdc79edff6fdcb87a55b620174f7a989e3eb31b65231f4af57f00b8c" "checksum phf 0.7.23 (registry+https://github.com/rust-lang/crates.io-index)" = "cec29da322b242f4c3098852c77a0ca261c9c01b806cae85a5572a1eb94db9a6" "checksum phf_shared 0.7.23 (registry+https://github.com/rust-lang/crates.io-index)" = "b539898d22d4273ded07f64a05737649dc69095d92cb87c7097ec68e3f150b93" "checksum pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "676e8eb2b1b4c9043511a9b7bea0915320d7e502b0a079fb03f9635a5252b18c" @@ -814,15 +1055,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum quote 0.6.8 (registry+https://github.com/rust-lang/crates.io-index)" = "dd636425967c33af890042c483632d33fa7a18f19ad1d7ea72e8998c6ef8dea5" "checksum rand 0.3.22 (registry+https://github.com/rust-lang/crates.io-index)" = "15a732abf9d20f0ad8eeb6f909bf6868722d9a06e1e50802b6a70351f40b4eb1" "checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd" +"checksum rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e464cd887e869cddcae8792a4ee31d23c7edd516700695608f5b98c67ee0131c" +"checksum rand_core 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "edecf0f94da5551fc9b492093e30b041a891657db7940ee221f9d2f66e82eef2" "checksum rayon 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "df7a791f788cb4c516f0e091301a29c2b71ef680db5e644a7d68835c8ae6dbfa" "checksum rayon-core 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b055d1e92aba6877574d8fe604a63c8b5df60f60e5982bf7ccbb1338ea527356" "checksum redox_syscall 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)" = "c214e91d3ecf43e9a4e41e578973adeb14b474f2bee858742d127af75a0112b1" "checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" +"checksum regex 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9329abc99e39129fcceabd24cf5d85b4671ef7c29c50e972bc5afe32438ec384" +"checksum regex 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "2069749032ea3ec200ca51e4a31df41759190a88edca0d2d86ee8bedf7073341" +"checksum regex-syntax 0.5.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7d707a4fa2637f2dca2ef9fd02225ec7661fe01a53623c1e6515b6916511f7a7" +"checksum regex-syntax 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "747ba3b235651f6e2f67dfa8bcdcd073ddb7c243cb21c442fc12395dfcac212d" "checksum rust-matrix-lib 0.1.0 (git+https://github.com/erikjohnston/rust-matrix-lib)" = "" "checksum rustc-demangle 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "bcfe5b13211b4d78e5c2cadfebd7769197d95c639c35a50057eb4c05de811395" +"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" "checksum ryu 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7153dd96dade874ab973e098cb62fcdbb89a03682e46b144fd09550998d4a4a7" "checksum safemem 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e27a8b19b835f7aea908818e871f5cc3a5a186550c30773be987e155e8163d8f" "checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" +"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" "checksum serde 1.0.78 (registry+https://github.com/rust-lang/crates.io-index)" = "92ec94e2754699adddbbc4f555791bd3acc2a2f5574cba16c93a4a9cf4a04415" "checksum serde_derive 1.0.78 (registry+https://github.com/rust-lang/crates.io-index)" = "0fb622d85245add5327d4f08b2d24fd51fa5d35fe1bba19ee79a1f211e9ac0ff" "checksum serde_json 1.0.27 (registry+https://github.com/rust-lang/crates.io-index)" = "59790990c5115d16027f00913e2e66de23a51f70422e549d2ad68c8c5f268f1c" @@ -832,24 +1082,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum smallvec 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "153ffa32fd170e9944f7e0838edf824a754ec4c1fc64746fcc9fe1f8fa602e5d" "checksum socket2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "c4d11a52082057d87cb5caa31ad812f4504b97ab44732cd8359df2e9ff9f48e7" "checksum sodiumoxide 0.0.16 (registry+https://github.com/rust-lang/crates.io-index)" = "eb5cb2f14f9a51352ad65e59257a0a9459d5a36a3615f3d53a974c82fdaaa00a" +"checksum stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8" "checksum stringprep 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "8ee348cb74b87454fff4b551cbf727025810a004f88aeacae7f85b87f4e9a1c1" "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" "checksum syn 0.14.9 (registry+https://github.com/rust-lang/crates.io-index)" = "261ae9ecaa397c42b960649561949d69311f08eeaea86a65696e6e46517cf741" "checksum syn 0.15.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e5c1514eb7bb4216fc722b3cd08783d326d7de0d62f6d5e48a774f610bc97cb6" "checksum synstructure 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "85bb9b7550d063ea184027c9b8c20ac167cd36d3e06b3a40bceb9d746dc1a7b7" "checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" +"checksum termios 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "70226acdf12d182df757d9fb07c0257a1558ec48c8059f607d6b38145ce4e2fa" "checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6" +"checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" "checksum time 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)" = "d825be0eb33fda1a7e68012d51e9c7f451dc1a69391e7fdc197060bb8c56667b" "checksum typenum 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "612d636f949607bdf9b123b4a6f6d966dedf3ff669f7f045890d3a4a73948169" +"checksum ucd-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "fd2be2d6639d0f8fe6cdda291ad456e23629558d466e2789d2c3e9892bda285d" "checksum unicode-bidi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" "checksum unicode-normalization 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "6a0180bc61fc5a987082bfa111f4cc95c4caff7f9799f3e46df09163a937aa25" "checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" +"checksum utf8-ranges 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "fd70f467df6810094968e2fce0ee1bd0e87157aceb026a8c083bcf5e25b9efe4" "checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" "checksum version_check 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7716c242968ee87e5542f8021178248f267f295a5c4803beae8b8b7fd9bc6051" "checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" "checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd" +"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml index b4d63c6..c60e850 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,11 +1,13 @@ [package] authors = ["Erik Johnston"] -name = "rust-synapse-compress-state" +name = "synapse-compress-state" version = "0.1.0" +description = "A tool to compress some state in a Synapse instance's database" [dependencies] clap = "2.32.0" fallible-iterator = "0.1.5" +indicatif = "0.9.0" postgres = "0.15.2" rayon = "1.0.2" diff --git a/README.md b/README.md index 163e0c3..be03098 100644 --- a/README.md +++ b/README.md @@ -2,22 +2,43 @@ An experimental tool that reads in the rows from `state_groups_state` and `state_group_edges` tables for a particular room and calculates the changes that -could be made that (hopefully) will signifcantly reduce the number of rows. +could be made that (hopefully) will significantly reduce the number of rows. This tool currently *does not* write to the database in any way, so should be -safe to run. +safe to run. If the `-o` option is specified then SQL will be written to the +given file that would change the tables to match the calculated state. (Note +that if `-t` is given then each change to a particular state group is wrapped +in a transaction) + +## Algorithm + +The algorithm works by attempting to create a tree of deltas, produced by +appending state groups to different "levels". Each level has a maximum size, where +each state group is appended to the lowest level that is not full. + +This produces a graph that looks approximately like the following, in the case +of having two levels with the bottom level (L1) having a maximum size of 3: + +``` +L2 <-------------------- L2 <---------- ... +^--- L1 <--- L1 <--- L1 ^--- L1 <--- L1 <--- L1 +``` + +The sizes and number of levels used can be controlled via `-l`. ## Example ``` -$ cargo run --release -- -p "postgresql://localhost/synapse" -r '!some_room:example.com' - Compiling rust-synapse-compress-state v0.1.0 (file:///home/erikj/git/rust-synapse-compress-state) - Finished release [optimized] target(s) in 2.39s - Running `target/release/rust-synapse-compress-state -p 'postgresql://localhost/synapse' -r '!some_room:example.com'` -Missing 11 state groups -Number of entries: 25694 -Number of rows: 356650 -Number of rows compressed: 41068 +$ synapse-compress-state -p "postgresql://localhost/synapse" -r '!some_room:example.com' +Fetching state from DB for room '!some_room:example.com'... +Got initial state from database. Checking for any missing state groups... +Number of state groups: 73904 +Number of rows in current table: 2240043 +Number of rows after compression: 165754 (7.40%) +Compression Statistics: + Number of forced resets due to lacking prev: 34 + Number of compressed rows caused by the above: 17092 + Number of state groups changed: 2748 New state map matches old one ``` diff --git a/src/compressor.rs b/src/compressor.rs new file mode 100644 index 0000000..57c5df1 --- /dev/null +++ b/src/compressor.rs @@ -0,0 +1,255 @@ +//! This is the actual compression algorithm. +//! +//! The algorithm attempts to make a tree of deltas for the state group maps. +//! This is done by having multiple "levels", where each level has a maximum +//! size. The state groups are iterated over, with deltas being calculated +//! against the smallest level that isn't yet full. When a state group is +//! inserted into a level, or lower levels are reset to have their current +//! "head" at the new state group. +//! +//! This produces graphs that look roughly like, for two levels: +//! +//! ``` +//! L2 <-------------------- L2 <---------- ... +//! ^--- L1 <--- L1 <--- L1 ^--- L1 <--- L1 <--- L1 +//! ``` + + +use rust_matrix_lib::state_map::StateMap; + +use std::collections::BTreeMap; + +use {collapse_state_maps, StateGroupEntry}; + + +/// Holds information about a particular level. +struct Level { + /// The maximum size this level is allowed to be + max_length: usize, + /// The (approximate) current chain length of this level. This is equivalent + /// to recursively following `current` + current_chain_length: usize, + /// The head of this level + current: Option, +} + +impl Level { + /// Creates a new Level with the given maximum length + pub fn new(max_length: usize) -> Level { + Level { + max_length, + current_chain_length: 0, + current: None, + } + } + + /// Update the current head of this level. If delta is true then it means + /// that given state group will (probably) reference the previous head. + /// + /// Panics if `delta` is true and the level is already full. + pub fn update(&mut self, current: i64, delta: bool) { + self.current = Some(current); + + if delta { + // If we're referencing the previous head then increment our chain + // length estimate + if !self.has_space() { + panic!("Tried to add to a already full level"); + } + + self.current_chain_length += 1; + } else { + // Otherwise, we've started a new chain with a single entry. + self.current_chain_length = 1; + } + } + + /// Get the current head of the level + pub fn get_current(&self) -> Option { + self.current + } + + /// Whether there is space in the current chain at this level. If not then a + /// new chain should be started. + pub fn has_space(&self) -> bool { + self.current_chain_length < self.max_length + } +} + + +/// Keeps track of some statistics of a compression run. +#[derive(Default)] +pub struct Stats { + /// How many state groups we couldn't find a delta for, despite trying. + pub resets_no_suitable_prev: usize, + /// The sum of the rows of the state groups counted by + /// `resets_no_suitable_prev`. + pub resets_no_suitable_prev_size: usize, + /// How many state groups we have changed. + pub state_groups_changed: usize, +} + + +/// Attempts to compress a set of state deltas using the given level sizes. +pub struct Compressor<'a> { + original_state_map: &'a BTreeMap, + pub new_state_group_map: BTreeMap, + levels: Vec, + pub stats: Stats, +} + +impl<'a> Compressor<'a> { + /// Creates a compressor and runs the compression algorithm. + pub fn compress( + original_state_map: &'a BTreeMap, + level_sizes: &[usize], + ) -> Compressor<'a> { + let mut compressor = Compressor { + original_state_map, + new_state_group_map: BTreeMap::new(), + levels: level_sizes.iter().map(|size| Level::new(*size)).collect(), + stats: Stats::default(), + }; + + compressor.create_new_tree(); + + compressor + } + + /// Actually runs the compression algorithm + fn create_new_tree(&mut self) { + if !self.new_state_group_map.is_empty() { + panic!("Can only call `create_new_tree` once"); + } + + for (&state_group, entry) in self.original_state_map { + let mut prev_state_group = None; + for level in &mut self.levels { + if level.has_space() { + prev_state_group = level.get_current(); + level.update(state_group, true); + break; + } else { + level.update(state_group, false); + } + } + + let (delta, prev_state_group) = if entry.prev_state_group == prev_state_group { + (entry.state_map.clone(), prev_state_group) + } else { + self.stats.state_groups_changed += 1; + self.get_delta(prev_state_group, state_group) + }; + + self.new_state_group_map.insert( + state_group, + StateGroupEntry { + prev_state_group, + state_map: delta, + }, + ); + } + } + + /// Attempts to calculate the delta between two state groups. + /// + /// This is not always possible if the given candidate previous state group + /// have state keys that are not in the new state group. In this case the + /// function will try and iterate back up the current tree to find a state + /// group that can be used as a base for a delta. + /// + /// Returns the state map and the actual base state group (if any) used. + fn get_delta(&mut self, prev_sg: Option, sg: i64) -> (StateMap, Option) { + let state_map = collapse_state_maps(&self.original_state_map, sg); + + let mut prev_sg = if let Some(prev_sg) = prev_sg { + prev_sg + } else { + return (state_map, None); + }; + + // This is a loop to go through to find the first prev_sg which can be + // a valid base for the state group. + let mut prev_state_map; + 'outer: loop { + prev_state_map = collapse_state_maps(&self.original_state_map, prev_sg); + for (t, s) in prev_state_map.keys() { + if !state_map.contains_key(t, s) { + // This is not a valid base as it contains key the new state + // group doesn't have. Attempt to walk up the tree to find a + // better base. + if let Some(psg) = self.new_state_group_map[&prev_sg].prev_state_group { + prev_sg = psg; + continue 'outer; + } + + // Couldn't find a new base, so we give up and just persist + // a full state group here. + self.stats.resets_no_suitable_prev += 1; + self.stats.resets_no_suitable_prev_size += state_map.len(); + + return (state_map, None); + } + } + + break; + } + + // We've found a valid base, now we just need to calculate the delta. + let mut delta_map = StateMap::new(); + + for ((t, s), e) in state_map.iter() { + if prev_state_map.get(t, s) != Some(e) { + delta_map.insert(t, s, e.clone()); + } + } + + (delta_map, Some(prev_sg)) + } +} + +#[test] +fn test_new_map() { + let mut initial: BTreeMap = BTreeMap::new(); + + let mut prev = None; + for i in 0i64..=13i64 { + initial.insert( + i, + StateGroupEntry { + prev_state_group: prev, + state_map: StateMap::new(), + }, + ); + + prev = Some(i) + } + + let compressor = Compressor::compress(&initial, &[3, 3]); + + let new_state = compressor.new_state_group_map; + + let expected_edges: BTreeMap = vec![ + (1, 0), + (2, 1), + (4, 3), + (5, 4), + (6, 3), + (7, 6), + (8, 7), + (9, 6), + (10, 9), + (11, 10), + (13, 12), + ].into_iter() + .collect(); + + for sg in 0i64..=13i64 { + assert_eq!( + expected_edges.get(&sg).cloned(), + new_state[&sg].prev_state_group, + "state group {} did not match expected", + sg, + ); + } +} diff --git a/src/database.rs b/src/database.rs new file mode 100644 index 0000000..23a7fe2 --- /dev/null +++ b/src/database.rs @@ -0,0 +1,125 @@ +use fallible_iterator::FallibleIterator; +use indicatif::{ProgressBar, ProgressStyle}; +use postgres::{Connection, TlsMode}; + +use std::collections::BTreeMap; + +use StateGroupEntry; + + +/// Fetch the entries in state_groups_state (and their prev groups) for the +/// given `room_id` by connecting to the postgres database at `db_url`. +pub fn get_data_from_db(db_url: &str, room_id: &str) -> BTreeMap { + let conn = Connection::connect(db_url, TlsMode::None).unwrap(); + + let mut state_group_map = get_initial_data_from_db(&conn, room_id); + + println!("Got initial state from database. Checking for any missing state groups..."); + + // Due to reasons some of the state groups appear in the edges table, but + // not in the state_groups_state table. This means they don't get included + // in our DB queries, so we have to fetch any missing groups explicitly. + // Since the returned groups may themselves reference groups we don't have, + // we need to do this recursively until we don't find any more missing. + loop { + let missing_sgs: Vec<_> = state_group_map + .iter() + .filter_map(|(_sg, entry)| { + if let Some(prev_sg) = entry.prev_state_group { + if state_group_map.contains_key(&prev_sg) { + None + } else { + Some(prev_sg) + } + } else { + None + } + }).collect(); + + if missing_sgs.is_empty() { + break; + } + + println!("Missing {} state groups", missing_sgs.len()); + + let map = get_missing_from_db(&conn, &missing_sgs); + state_group_map.extend(map.into_iter()); + } + + state_group_map +} + +/// Fetch the entries in state_groups_state (and their prev groups) for the +/// given `room_id` by fetching all state with the given `room_id`. +fn get_initial_data_from_db(conn: &Connection, room_id: &str) -> BTreeMap { + let stmt = conn + .prepare( + r#" + SELECT m.id, prev_state_group, type, state_key, s.event_id + FROM state_groups AS m + LEFT JOIN state_groups_state AS s ON (m.id = s.state_group) + LEFT JOIN state_group_edges AS e ON (m.id = e.state_group) + WHERE m.room_id = $1 + "#, + ).unwrap(); + + let trans = conn.transaction().unwrap(); + let mut rows = stmt.lazy_query(&trans, &[&room_id], 1000).unwrap(); + + let mut state_group_map: BTreeMap = BTreeMap::new(); + + let pb = ProgressBar::new_spinner(); + pb.set_style(ProgressStyle::default_spinner().template("{spinner} [{elapsed}] {pos}")); + pb.enable_steady_tick(100); + + let mut num_rows = 0; + while let Some(row) = rows.next().unwrap() { + let state_group = row.get(0); + + let entry = state_group_map.entry(state_group).or_default(); + + entry.prev_state_group = row.get(1); + let etype: Option = row.get(2); + + if let Some(etype) = etype { + entry + .state_map + .insert(&etype, &row.get::<_, String>(3), row.get(4)); + } + + pb.inc(1); + num_rows += 1; + } + + pb.set_length(num_rows); + pb.finish(); + + state_group_map +} + +/// Get any missing state groups from the database +fn get_missing_from_db(conn: &Connection, missing_sgs: &[i64]) -> BTreeMap { + let stmt = conn + .prepare( + r#" + SELECT state_group, prev_state_group + FROM state_group_edges + WHERE state_group = ANY($1) + "#, + ).unwrap(); + let trans = conn.transaction().unwrap(); + + let mut rows = stmt.lazy_query(&trans, &[&missing_sgs], 100).unwrap(); + + let mut state_group_map: BTreeMap = BTreeMap::new(); + + while let Some(row) = rows.next().unwrap() { + let state_group = row.get(0); + + let entry = state_group_map.entry(state_group).or_default(); + + entry.prev_state_group = row.get(1); + } + + state_group_map +} diff --git a/src/main.rs b/src/main.rs index c6dd089..99733e6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,149 +1,42 @@ +//! This is a tool that attempts to further compress state maps within a +//! Synapse instance's database. Specifically, it aims to reduce the number of +//! rows that a given room takes up in the `state_groups_state` table. + #[macro_use] extern crate clap; extern crate fallible_iterator; +extern crate indicatif; extern crate postgres; extern crate rayon; extern crate rust_matrix_lib; -use clap::{App, Arg, ArgGroup}; -use fallible_iterator::FallibleIterator; -use postgres::{Connection, TlsMode}; +mod compressor; +mod database; + +use compressor::Compressor; + +use clap::{App, Arg}; use rayon::prelude::*; use rust_matrix_lib::state_map::StateMap; use std::collections::BTreeMap; use std::fs::File; -use std::io::{BufRead, BufReader, Write}; +use std::io::Write; +use std::str::FromStr; /// An entry for a state group. Consists of an (optional) previous group and the /// delta from that previous group (or the full state if no previous group) -#[derive(Default, Debug, Clone)] -struct StateGroupEntry { +#[derive(Default, Debug, Clone, PartialEq, Eq)] +pub struct StateGroupEntry { prev_state_group: Option, state_map: StateMap, } -/// Fetch the entries in state_groups_state (and their prev groups) for the -/// given `room_id` by connecting to the postgres database at `db_url`. -fn get_data_from_db(db_url: &str, room_id: &str) -> BTreeMap { - let conn = Connection::connect(db_url, TlsMode::None).unwrap(); - - let stmt = conn - .prepare( - r#" - SELECT state_group, prev_state_group, type, state_key, event_id - FROM state_groups_state - LEFT JOIN state_group_edges USING (state_group) - WHERE room_id = $1 - "#, - ).unwrap(); - let trans = conn.transaction().unwrap(); - - let mut rows = stmt.lazy_query(&trans, &[&room_id], 100).unwrap(); - - let mut state_group_map: BTreeMap = BTreeMap::new(); - - let mut started = false; - - while let Some(row) = rows.next().unwrap() { - if !started { - started = true; - println!("Started streaming from DB!"); - } - let state_group = row.get(0); - - let entry = state_group_map.entry(state_group).or_default(); - - entry.prev_state_group = row.get(1); - entry.state_map.insert( - &row.get::<_, String>(2), - &row.get::<_, String>(3), - row.get(4), - ); - } - - state_group_map -} - -/// Get any missing state groups from the database -fn get_missing_from_db(db_url: &str, missing_sgs: &[i64]) -> BTreeMap { - let conn = Connection::connect(db_url, TlsMode::None).unwrap(); - - let stmt = conn - .prepare( - r#" - SELECT state_group, prev_state_group - FROM state_group_edges - WHERE state_group = ANY($1) - "#, - ).unwrap(); - let trans = conn.transaction().unwrap(); - - let mut rows = stmt.lazy_query(&trans, &[&missing_sgs], 100).unwrap(); - - let mut state_group_map: BTreeMap = BTreeMap::new(); - - while let Some(row) = rows.next().unwrap() { - let state_group = row.get(0); - - let entry = state_group_map.entry(state_group).or_default(); - - entry.prev_state_group = row.get(1); - } - - state_group_map -} - -/// Get state group entries from the file at `path`. -/// -/// This should be formatted as `|` separated values, with the empty string -/// representing null. (Yes, this is a bit dodgy by means its trivial to get -/// from postgres. We should use a better format). -/// -/// The following invocation produces the correct output: -/// -/// ```bash -/// psql -At synapse > test.data < BTreeMap { - let mut state_group_map: BTreeMap = BTreeMap::new(); - - let f = File::open(path).unwrap(); - let f = BufReader::new(f); - - for line in f.lines() { - let line = line.unwrap(); - - let mut iter = line.split('|'); - - let state_group = iter.next().unwrap().parse().unwrap(); - - let entry = state_group_map.entry(state_group).or_default(); - - let prev_state_group_str = iter.next().unwrap(); - entry.prev_state_group = if prev_state_group_str.is_empty() { - None - } else { - Some(prev_state_group_str.parse().unwrap()) - }; - - entry.state_map.insert( - iter.next().unwrap(), - iter.next().unwrap(), - iter.next().unwrap().to_string(), - ); - } - - state_group_map -} - /// Gets the full state for a given group from the map (of deltas) -fn collapse_state_maps(map: &BTreeMap, state_group: i64) -> StateMap { +pub fn collapse_state_maps( + map: &BTreeMap, + state_group: i64, +) -> StateMap { let mut entry = &map[&state_group]; let mut state_map = StateMap::new(); @@ -169,6 +62,26 @@ fn collapse_state_maps(map: &BTreeMap, state_group: i64) - state_map } +/// Helper struct for parsing the `level_sizes` argument. +struct LevelSizes(Vec); + +impl FromStr for LevelSizes { + type Err = &'static str; + + fn from_str(s: &str) -> Result { + let mut sizes = Vec::new(); + + for size_str in s.split(",") { + let size: usize = size_str + .parse() + .map_err(|_| "Not a comma separated list of numbers")?; + sizes.push(size); + } + + Ok(LevelSizes(sizes)) + } +} + fn main() { let matches = App::new(crate_name!()) .version(crate_version!()) @@ -186,23 +99,26 @@ fn main() { .short("r") .value_name("ROOM_ID") .help("The room to process") - .takes_value(true), + .takes_value(true) + .required(true), ).arg( - Arg::with_name("input") - .short("f") - .value_name("FILE") - .help("File containing dumped state groups") - .takes_value(true), - ).arg( - Arg::with_name("output_diff") + Arg::with_name("output_file") .short("o") .value_name("FILE") - .help("File to output the changes to") + .help("File to output the changes to in SQL") + .takes_value(true), + ).arg( + Arg::with_name("individual_transactions") + .short("t") + .help("Whether to wrap each state group change in a transaction, when writing to file") + .requires("output_file"), + ).arg( + Arg::with_name("level_sizes") + .short("l") + .value_name("LEVELS") + .help("Sizes of each new level in the compression algorithm, as a comma separate list") + .default_value("100,50,25") .takes_value(true), - ).group( - ArgGroup::with_name("target") - .args(&["input", "room_id"]) - .required(true), ).get_matches(); let db_url = matches @@ -210,157 +126,113 @@ fn main() { .expect("db url should be required"); let mut output_file = matches - .value_of("output_diff") + .value_of("output_file") .map(|path| File::create(path).unwrap()); + let room_id = matches + .value_of("room_id") + .expect("room_id should be required since no file"); + + let individual_transactions = matches.is_present("individual_transactions"); + + let level_sizes = value_t_or_exit!(matches, "level_sizes", LevelSizes); // First we need to get the current state groups - let mut state_group_map = if let Some(path) = matches.value_of("input") { - get_data_from_file(path) - } else { - let room_id = matches - .value_of("room_id") - .expect("room_id should be required since no file"); - get_data_from_db(db_url, room_id) - }; + println!("Fetching state from DB for room '{}'...", room_id); + let state_group_map = database::get_data_from_db(db_url, room_id); - // For reasons that escape me some of the state groups appear in the edges - // table, but not in the state_groups_state table. This means they don't - // get included in our DB queries, so we have to fetch any missing groups - // explicitly. Since the returned groups may themselves reference groups - // we don't have we need to do this recursively until we don't find any - // more - loop { - let missing_sgs: Vec<_> = state_group_map - .iter() - .filter_map(|(_sg, entry)| { - if let Some(prev_sg) = entry.prev_state_group { - if state_group_map.contains_key(&prev_sg) { - None - } else { - Some(prev_sg) - } - } else { - None - } - }).collect(); + println!("Number of state groups: {}", state_group_map.len()); - if missing_sgs.is_empty() { - break; - } - - println!("Missing {} state groups", missing_sgs.len()); - - let map = get_missing_from_db(db_url, &missing_sgs); - state_group_map.extend(map.into_iter()); - } - - println!("Number of entries: {}", state_group_map.len()); - - let summed_size = state_group_map + let original_summed_size = state_group_map .iter() .fold(0, |acc, (_, v)| acc + v.state_map.len()); - println!("Number of rows: {}", summed_size); + println!("Number of rows in current table: {}", original_summed_size); - let mut new_state_group_map: BTreeMap = BTreeMap::new(); + // Now we actually call the compression algorithm. - // Now we loop through and create our new state maps from the existing - // ones. - // The existing table is made up of chains of groups at most 100 nodes - // long. At the start of each chain there is a copy of the full state at - // that point. This algorithm adds edges between such "checkpoint" nodes, - // so that there are chains between them. We cap such checkpoint chains to - // a length of 50. - // - // The idea here is that between checkpoint nodes only small subsets of - // state will have actually changed. - // - // (This approach can be generalised by adding more and more layers) + let compressor = Compressor::compress(&state_group_map, &level_sizes.0); - let mut last_checkpoint_opt = None; - let mut checkpoint_length = 0; + let new_state_group_map = compressor.new_state_group_map; - for (state_group, entry) in &state_group_map { - if entry.prev_state_group.is_none() { - // We're at a checkpoint node. If this is our first checkpoint - // node then there isn't much to do other than mark it. - let mut added_to_chain = false; - if let Some(ref last_checkpoint) = last_checkpoint_opt { - let checkpoint_entry = &state_group_map[last_checkpoint]; + // Done! Now to print a bunch of stats. - // We need to ensure that that aren't any entries in the - // previous checkpoint node that aren't in the state at this - // point, since the table schema doesn't support the idea of - // "deleting" state in the deltas. - // - // Note: The entry.state_map will be the full state here, rather - // than just the delta since prev_state_group is None. - if checkpoint_entry - .state_map - .keys() - .all(|(t, s)| entry.state_map.contains_key(t, s)) - { - // We create the new map by filtering out entries that match - // those in the previous checkpoint state. - let new_map: StateMap = entry - .state_map - .iter() - .filter(|((t, s), e)| checkpoint_entry.state_map.get(t, s) != Some(e)) - .map(|((t, s), e)| ((t, s), e.clone())) - .collect(); + let compressed_summed_size = new_state_group_map + .iter() + .fold(0, |acc, (_, v)| acc + v.state_map.len()); - // If we have an output file write the changes we've made - if let Some(ref mut fs) = output_file { - writeln!(fs, "edge_addition {} {}", state_group, *last_checkpoint).unwrap(); - for ((t, s), e) in new_map.iter() { - writeln!(fs, "state_replace {} {} {} {}", state_group, t, s, e) - .unwrap(); + let ratio = (compressed_summed_size as f64) / (original_summed_size as f64); + + println!( + "Number of rows after compression: {} ({:.2}%)", + compressed_summed_size, + ratio * 100. + ); + + println!("Compression Statistics:"); + println!( + " Number of forced resets due to lacking prev: {}", + compressor.stats.resets_no_suitable_prev + ); + println!( + " Number of compressed rows caused by the above: {}", + compressor.stats.resets_no_suitable_prev_size + ); + println!( + " Number of state groups changed: {}", + compressor.stats.state_groups_changed + ); + + // If we are given an output file, we output the changes as SQL. If the + // `individual_transactions` argument is set we wrap each change to a state + // group in a transaction. + + if let Some(output) = &mut output_file { + for (sg, old_entry) in &state_group_map { + let new_entry = &new_state_group_map[sg]; + + if old_entry != new_entry { + if individual_transactions { + writeln!(output, "BEGIN;"); + } + + writeln!( + output, + "DELETE FROM state_group_edges WHERE state_group = {};", + sg + ); + + if let Some(prev_sg) = new_entry.prev_state_group { + writeln!(output, "INSERT INTO state_group_edges (state_group, prev_state_group) VALUES ({}, {});", sg, prev_sg); + } + + writeln!( + output, + "DELETE FROM state_groups_state WHERE state_group = {};", + sg + ); + if new_entry.state_map.len() > 0 { + writeln!(output, "INSERT INTO state_groups_state (state_group, room_id, type, state_key, event_id) VALUES"); + let mut first = true; + for ((t, s), e) in new_entry.state_map.iter() { + if first { + write!(output, " "); + first = false; + } else { + write!(output, " ,"); } + writeln!(output, "({}, '{}', '{}', '{}', '{}')", sg, room_id, t, s, e); } - - new_state_group_map.insert( - *state_group, - StateGroupEntry { - prev_state_group: Some(*last_checkpoint), - state_map: new_map, - }, - ); - - added_to_chain = true; - } else { - new_state_group_map.insert(*state_group, entry.clone()); + writeln!(output, ";"); } - } else { - new_state_group_map.insert(*state_group, entry.clone()); - } - last_checkpoint_opt = Some(*state_group); - - // If we've added to the checkpoint chain we increment the length, - // otherwise it gets reset to zero. - if added_to_chain { - checkpoint_length += 1; - } else { - checkpoint_length = 0; + if individual_transactions { + writeln!(output, "COMMIT;"); + } + writeln!(output); } - - // If the chain is longer than 50 then lets reset to create a new - // chain. - if checkpoint_length >= 50 { - checkpoint_length = 0; - last_checkpoint_opt = None; - } - } else { - new_state_group_map.insert(*state_group, entry.clone()); } } - let summed_size = new_state_group_map - .iter() - .fold(0, |acc, (_, v)| acc + v.state_map.len()); - - println!("Number of rows compressed: {}", summed_size); - // Now let's iterate through and assert that the state for each group // matches between the two versions. state_group_map