diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..420a518 --- /dev/null +++ b/.flake8 @@ -0,0 +1,13 @@ +[flake8] +ignore = + ; except + E722, + ; inline regex + W605, + ; long lines + E501, + ; too complex + C901 +max-complexity = 10 +max-line-length = 120 +application-import-names = flake8 \ No newline at end of file
diff --git a/.gitignore b/.gitignore index 9f44b89..1b8fd9a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,12 +4,13 @@ *.egg *.egg-info dist +sdist +deb_dist build eggs parts bin var -sdist develop-eggs .installed.cfg @@ -42,3 +43,8 @@ _build # Pyenv .python-version + +# Tinybird +bl-* +out-* +.e
diff --git a/CHANGELOG.md b/CHANGELOG.md index 5425f30..fb68cc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,19 @@ +# clickhouse-mysql 2019-09-03 + +## improvements +* fix --src-tables-where-clauses to accept both filenames (for long where-clauses) and the where-clauses themselves (for shorter clauses) + +## bugfixes +* fix --src-port CLI option +* ensure UTF8 encoding for source migration + +# clickhouse-mysql 2019-03-25 + +## new features +* added new CLI option `--dst-schema` - change table full name to `schema`.`db__table` +* added new CLI option `--dst-cluster` - support table creation on cluster +* added new CLI option `--dst-distribute` - change table full name to `schema_all`.`db__table_all` and table engine to Distributed + # clickhouse-mysql 2018-03-14 ## new features @@ -14,4 +30,4 @@ ## bugfixes * config files vs CLI options order fixed - \ No newline at end of file +
diff --git a/README.md b/README.md index be2b7be..123796a 100644 --- a/README.md +++ b/README.md @@ -1,1250 +1,4 @@ # clickhouse-mysql-data-reader ---- - -# Table of Contents - - * [Introduction](#introduction) - * [Requirements and Installation](#requirements-and-installation) - * [Dev Installation](#dev-installation) - * [RPM Installation](#rpm-installation) - * [PyPi Installation](#pypi-installation) - * [GitHub-based Installation - Clone Sources](#github-based-installation---clone-sources) - * [MySQL setup](#mysql-setup) - * [Quick Start](#quick-start) - * [Operation](#operation) - * [Requirements and Limitations](#requirements-and-limitations) - * [Operation General Schema](#operation-general-schema) - * [Performance](#performance) - * [Examples](#examples) - * [Base Example](#base-example) - * [MySQL Migration Case 1 - with Tables Lock](#mysql-migration-case-1---with-tables-lock) - * [MySQL Migration Case 1 - Create ClickHouse Table](#mysql-migration-case-1---create-clickhouse-table) - * [MySQL Migration Case 1 - Migrate Existing Data](#mysql-migration-case-1---migrate-existing-data) - * [MySQL Migration Case 1 - Listen For New Data](#mysql-migration-case-1---listen-for-new-data) - * [MySQL Migration Case 2 - without Tables Lock](#mysql-migration-case-2---without-tables-lock) - * [MySQL Migration Case 2 - Create ClickHouse Table](#mysql-migration-case-2---create-clickhouse-table) - * [MySQL Migration Case 2 - Listen For New Data](#mysql-migration-case-2---listen-for-new-data) - * [MySQL Migration Case 2 - Migrate Existing Data](#mysql-migration-case-2---migrate-existing-data) - * [airline.ontime Test Case](#airlineontime-test-case) - * [airline.ontime Data Set in CSV files](#airlineontime-data-set-in-csv-files) - * [airline.ontime MySQL Table](#airlineontime-mysql-table) - * [airline.ontime ClickHouse Table](#airlineontime-clickhouse-table) - * [airline.ontime Data Reader](#airlineontime-data-reader) - * [airline.ontime Data Importer](#airlineontime-data-importer) - * 
[Testing](#testing) - * [Testing General Schema](#testing-general-schema) - * [MySQL Data Types](#mysql-data-types) - * [ClickHouse Data Types](#clickhouse-data-types) - * [MySQL -> ClickHouse Data Types Mapping](#mysql---clickhouse-data-types-mapping) - * [MySQL Test Tables](#mysql-test-tables) - * [ClickHouse Test Tables](#clickhouse-test-tables) - ---- - -# Introduction - -Utility to import data into ClickHouse from MySQL (mainly) and/or CSV files - -# Requirements and Installation - -Datareader requires at least **Python 3.4** with additional modules to be installed. -In most distributions Python 3 have `pip` utility named as `pip3`, so we'll use this naming. -However, you may have it called differently. - -Datareader can be installed either from `github` repo or from `pypi` repo. - -## Dev Installation -```bash -sudo yum install -y rpm-build -sudo yum install -y epel-release -sudo yum install -y https://dev.mysql.com/get/mysql57-community-release-el7-11.noarch.rpm -curl -s https://packagecloud.io/install/repositories/altinity/clickhouse/script.rpm.sh | sudo bash - -sudo yum install -y python34-pip python34-devel python34-setuptools - -./package_rpm_distr.sh -./pack/build.sh -ls -l ./build/bdist.linux-x86_64/rpm/RPMS/noarch/ -sudo yum install ./build/bdist.linux-x86_64/rpm/RPMS/noarch/clickhouse-mysql-* -``` - -## RPM Installation -**Tested on CentOS 7** - -Packagecloud repo from [packagecloud.io](https://packagecloud.io/Altinity/clickhouse) -More details on installation are available on [https://github.com/Altinity/clickhouse-rpm-install](https://github.com/Altinity/clickhouse-rpm-install) -```bash -curl -s https://packagecloud.io/install/repositories/altinity/clickhouse/script.rpm.sh | sudo bash -``` -Install EPEL (for `python3`) and MySQL (for `libmysqlclient`) repos -```bash -sudo yum install -y epel-release -sudo yum install -y https://dev.mysql.com/get/mysql57-community-release-el7-11.noarch.rpm -``` - -If you do not have EPEL available in your repos, install it directly from EPEL site -```bash -sudo yum install -y https://download.fedoraproject.org/pub/epel/7/x86_64/Packages/e/epel-release-7-11.noarch.rpm -``` - -Install data reader from [packagecloud.io](https://packagecloud.io/Altinity/clickhouse) -```bash -sudo yum install -y clickhouse-mysql -``` -clickhouse packages would also be installed as dependencies. - -Prepare config file - copy **example** file into production and edit it. -```bash -sudo cp /etc/clickhouse-mysql/clickhouse-mysql-example.conf /etc/clickhouse-mysql/clickhouse-mysql.conf -sudo vim /etc/clickhouse-mysql/clickhouse-mysql.conf -``` - -Start service -```bash -sudo service clickhouse-mysql start -``` - -## PyPi Installation -In case you need just to use the app - this is the most convenient way to go. - -Install dependencies. 
-MySQL repo (for `mysql-community-devel`) -```bash -sudo yum install -y https://dev.mysql.com/get/mysql57-community-release-el7-11.noarch.rpm -``` -epel (for `python3`) -```bash -sudo yum install -y epel-release -``` - -clickhouse-client (for `clickhouse-client`) from Packagecloud repo from [packagecloud.io](https://packagecloud.io/Altinity/clickhouse) -More details on installation are available on [https://github.com/Altinity/clickhouse-rpm-install](https://github.com/Altinity/clickhouse-rpm-install) -```bash -curl -s https://packagecloud.io/install/repositories/altinity/clickhouse/script.rpm.sh | sudo bash -``` -```bash -sudo yum install -y clickhouse-client -``` - -and direct dependencies: -```bash -sudo yum install -y mysql-community-devel -sudo yum install -y mariadb-devel -sudo yum install -y gcc -sudo yum install -y python34-devel python34-pip -``` - -Install data reader -```bash -sudo pip3 install clickhouse-mysql -``` - -Now we are able to call datareader as an app - perform last installation steps - install service files, etc -```bash -[user@localhost ~]$ which clickhouse-mysql -/usr/bin/clickhouse-mysql -/usr/bin/clickhouse-mysql --install -``` - -## GitHub-based Installation - Clone Sources -In case you'd like to play around with the sources this is the way to go. - -Install dependencies: - -`MySQLdb` package is used for communication with MySQL: -```bash -pip3 install mysqlclient -``` - -`mysql-replication` package is used for communication with MySQL also: -[https://github.com/noplay/python-mysql-replication](https://github.com/noplay/python-mysql-replication) -```bash -pip3 install mysql-replication -``` - -`clickhouse-driver` package is used for communication with ClickHouse: -[https://github.com/mymarilyn/clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) -```bash -pip3 install clickhouse-driver -``` - -Clone sources from github -```bash -git clone https://github.com/Altinity/clickhouse-mysql-data-reader -``` - -## MySQL setup - -Also the following (at least one of) MySQL privileges are required for this operation: `SUPER`, `REPLICATION CLIENT` - -```mysql -CREATE USER 'reader'@'%' IDENTIFIED BY 'qwerty'; -CREATE USER 'reader'@'127.0.0.1' IDENTIFIED BY 'qwerty'; -CREATE USER 'reader'@'localhost' IDENTIFIED BY 'qwerty'; -GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'%'; -GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'127.0.0.1'; -GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'localhost'; -FLUSH PRIVILEGES; -``` - -Also the following MySQL config options are required: -```ini -[mysqld] -# mandatory -server-id = 1 -log_bin = /var/lib/mysql/bin.log -binlog-format = row # very important if you want to receive write, update and delete row events -# optional -expire_logs_days = 30 -max_binlog_size = 768M -# setup listen address -bind-address = 0.0.0.0 -``` - -# Quick Start - -Suppose we have MySQL `airline.ontime` table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](clickhouse_mysql_examples/airline_ontime_schema_mysql.sql) and want to migrate it into ClickHouse. 
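
Before going through the steps below, it can help to confirm that MySQL is configured as described in [MySQL setup](#mysql-setup). The following sketch is illustrative only (it is not part of clickhouse-mysql) and assumes the `reader`/`qwerty` account from the GRANT example; adjust host and credentials to your environment:

```python
# Illustrative pre-flight check for the replication prerequisites described above.
import MySQLdb  # provided by the mysqlclient package

conn = MySQLdb.connect(host='127.0.0.1', user='reader', passwd='qwerty')
cursor = conn.cursor()

cursor.execute("SELECT @@server_id, @@log_bin, @@binlog_format")
server_id, log_bin, binlog_format = cursor.fetchone()
assert server_id > 0, "server-id must be set to a non-zero value"
assert log_bin == 1, "log_bin must be enabled"
assert binlog_format == 'ROW', "binlog-format must be ROW to receive row events"

# The reader account needs SELECT, REPLICATION CLIENT and REPLICATION SLAVE (or SUPER)
cursor.execute("SHOW GRANTS")
for (grant,) in cursor.fetchall():
    print(grant)

conn.close()
```
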
- -Steps to do: - - * Setup MySQL access as described in [MySQL setup](#mysql-setup) - * Run data reader as following: - -```bash -clickhouse-mysql \ - --src-server-id=1 \ - --src-wait \ - --nice-pause=1 \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=qwerty \ - --src-tables=airline.ontime \ - --dst-host=127.0.0.1 \ - --dst-create-table \ - --migrate-table \ - --pump-data \ - --csvpool -``` - -Expected results are: - * automatically create target table in ClickHouse (if possible) - * migrate existing data from MySQL to ClickHouse - * after migration completed, listen for new events to come and pump data from MySQL into ClickHouse - -Options description - * `--src-server-id` - Master's server id - * `--src-wait` - wait for new data to come - * `--nice-pause=1` - when no data available sleep for 1 second - * `--src-host=127.0.0.1` - MySQL source host - * `--src-user=reader` - MySQL source user (remember about PRIVILEGES for this user) - * `--src-password=qwerty` - MySQL source password (remember about PRIVILEGES for this user) - * `--src-tables=airline.ontime` - list of MySQL source tables to process - * `--dst-host=127.0.0.1` - ClickHouse host - * `--dst-create-table` - create target table automatically - * `--migrate-table` - migrate source tables - * `--pump-data` - pump data from MySQL into ClickHouse after data migrated - * `--csvpool` - make pool of csv files while pumping data (assumes `--mempool` also) - -Choose any combination of `--pump-data`, `--migrate-table`, `--create-table-sql`, `--dst-create-table` - -# Operation - -## Requirements and Limitations - -Data reader understands INSERT SQL statements only. In practice this means that: - * You need to create required table in ClickHouse before starting data read procedure. More on how to create target ClickHouse table: [MySQL -> ClickHouse Data Types Mapping](#mysql---clickhouse-data-types-mapping) - * From all DML statements INSERT-only are handled, which means: - * UPDATE statements are not handled - meaning UPDATEs within MySQL would not be relayed into ClickHouse - * DELETE statements are not handled - meaning DELETEs within MySQL would not be relayed into ClickHouse - * DDL statements are not handled, which means: - * source table structure change (ALTER TABLE) has to be handled externally and can lead to insertion errors - -## Operation General Schema - - * Step 1. Data Reader reads data from the source event-by-event (for MySQL binlog) or line-by-line (file). - * Step 2. **OPTIONAL** Caching in memory pool. Since ClickHouse prefers to get data in bundles (row-by-row insertion is extremely slow), we need to introduce some caching. - Cache can be flushed by either of: - * number of rows in cache - * number of events in cache - * time elapsed - * data source depleted - * Step 3. **OPTIONAL** Writing CSV file. Sometimes it is useful to have data also represented as a file - * Step 4. Writing data into ClickHouse. Depending on the configuration of the previous steps data are written into ClickHouse by either of: - * directly event-by-event or line-by-line - * from memory cache as a bulk insert operation - * from CSV file via `clickhouse-client` - -## Performance - -`pypy` significantly improves performance. You should try it. Really. Up to **10 times performance boost** can be achieved. 
-For example you can start with [Portable PyPy distribution for Linux](https://github.com/squeaky-pl/portable-pypy#portable-pypy-distribution-for-linux) - - use [Python 3.x release](https://github.com/squeaky-pl/portable-pypy#latest-python-35-release) -Unpack it into your place of choice. - -```bash -[user@localhost ~]$ ls -l pypy3.5-5.9-beta-linux_x86_64-portable -total 32 -drwxr-xr-x 2 user user 140 Oct 24 01:14 bin -drwxr-xr-x 5 user user 4096 Oct 3 11:57 include -drwxr-xr-x 4 user user 4096 Oct 3 11:57 lib -drwxr-xr-x 13 user user 4096 Oct 3 11:56 lib_pypy -drwxr-xr-x 3 user user 15 Oct 3 11:56 lib-python --rw-r--r-- 1 user user 11742 Oct 3 11:56 LICENSE --rw-r--r-- 1 user user 1296 Oct 3 11:56 README.rst -drwxr-xr-x 14 user user 4096 Oct 24 01:16 site-packages -drwxr-xr-x 2 user user 195 Oct 3 11:57 virtualenv_support -``` - -Install `pip` -```bash -pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy -m ensurepip -``` -Install required modules -```bash -pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install mysql-replication -pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install clickhouse-driver -``` -`mysqlclient` may require to install `libmysqlclient-dev` and `gcc` -```bash -pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install mysqlclient -``` -Install them if need be -```bash -sudo apt-get install libmysqlclient-dev -``` -```bash -sudo apt-get install gcc -``` - -Now you can run data reader via `pypy` -```bash -/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy clickhouse-mysql -``` - -# Examples - -## Base Example - -Let's walk over test example of tool launch command line options. -This code snippet is taken from shell script (see more details in [airline.ontime Test Case](#airlineontime-test-case)) - -```bash -$PYTHON clickhouse-mysql ${*:1} \ - --src-server-id=1 \ - --src-resume \ - --src-wait \ - --nice-pause=1 \ - --log-level=info \ - --log-file=ontime.log \ - --src-host=127.0.0.1 \ - --src-user=root \ - --dst-host=127.0.0.1 \ - --csvpool \ - --csvpool-file-path-prefix=qwe_ \ - --mempool-max-flush-interval=60 \ - --mempool-max-events-num=1000 \ - --pump-data -``` -Options description - * `--src-server-id` - Master's server id - * `--src-resume` - resume data loading from the previous point. 
When the tool starts - resume from the end of the log - * `--src-wait` - wait for new data to come - * `--nice-pause=1` - when no data available sleep for 1 second - * `--log-level=info` - log verbosity - * `--log-file=ontime.log` - log file name - * `--src-host=127.0.0.1` - MySQL source host - * `--src-user=root` - MySQL source user (remember about PRIVILEGES for this user) - * `--dst-host=127.0.0.1` - ClickHouse host - * `--csvpool` - make pool of csv files (assumes `--mempool` also) - * `--csvpool-file-path-prefix=qwe_` - put these CSV files having `qwe_` prefix in `CWD` - * `--mempool-max-flush-interval=60` - flush mempool at least every 60 seconds - * `--mempool-max-events-num=1000` - flush mempool at least each 1000 events (not rows, but events) - * `--pump-data` - pump data from MySQL into ClickHouse - -## MySQL Migration Case 1 - with Tables Lock - -Suppose we have MySQL `airline.ontime` table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](clickhouse_mysql_examples/airline_ontime_schema_mysql.sql) with multiple rows: - -```mysql -mysql> SELECT COUNT(*) FROM airline.ontime; -+----------+ -| count(*) | -+----------+ -| 7694964 | -+----------+ -``` - -MySQL is already configured as [described earlier](#mysql-setup). -Let's migrate existing data to ClickHouse and listen for newly coming data in order to migrate them to CLickHouse on-the-fly. - -### MySQL Migration Case 1 - Create ClickHouse Table - -Create ClickHouse table description -```bash -clickhouse-mysql \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=Qwerty1# \ - --create-table-sql-template \ - --with-create-database \ - --src-only-table=airline.ontime > create_clickhouse_table_template.sql -``` -We have **CREATE TABLE** template stored in `create_clickhouse_table_template.sql` file. -```bash -vim create_clickhouse.sql -``` -Setup sharding field and primary key. These columns must not be `Nullable` -```bash mysql -...cut... - `Year` UInt16, -...cut... - `FlightDate` Date, -...cut... - `Month` UInt8, -...cut... -) ENGINE = MergeTree(FlightDate, (FlightDate, Year, Month), 8192) -``` - -Create table in ClickHouse -```bash -clickhouse-client -mn < create_clickhouse_table_template.sql -``` - -### MySQL Migration Case 1 - Migrate Existing Data - -Lock MySQL in order to avoid new data coming while data migration is running. Keep `mysql` client open during the whole process -```mysql -mysql> FLUSH TABLES WITH READ LOCK; -``` - -Migrate data -```bash -clickhouse-mysql \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=Qwerty1# \ - --migrate-table \ - --src-only-table=airline.ontime \ - --dst-host=127.0.0.1 -``` -This may take some time. -Check all data is in ClickHouse -```mysql -:) select count(*) from airline.ontime; - -SELECT count(*) -FROM airline.ontime - -┌─count()─┐ -│ 7694964 │ -└─────────┘ -``` - -### MySQL Migration Case 1 - Listen For New Data - -Start `clickhouse-mysql` as a replication slave, so it will listen for new data coming: -```bash -clickhouse-mysql \ - --src-server-id=1 \ - --src-resume \ - --src-wait \ - --nice-pause=1 \ - --src-host=127.0.0.1 \ - --src-user=reader --src-password=Qwerty1# \ - --src-only-table=airline.ontime \ - --dst-host=127.0.0.1 \ - --csvpool \ - --csvpool-file-path-prefix=qwe_ \ - --mempool-max-flush-interval=60 \ - --mempool-max-events-num=10000 \ - --pump-data -``` - -Allow new data to be inserted into MySQL - i.e. unlock tables. - -```mysql -mysql> UNLOCK TABLES; -``` - -Insert some data into MySQL. 
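
While rows are being inserted, replication progress can also be watched programmatically. The sketch below is illustrative only (not part of the tool) and assumes `mysqlclient` and `clickhouse-driver` are installed and the `reader` account from [MySQL setup](#mysql-setup); for generating bulk test data, see the import script referenced next.

```python
# Illustrative: watch the MySQL and ClickHouse row counts converge while
# clickhouse-mysql keeps pumping data in the background.
import time

import MySQLdb
from clickhouse_driver import Client

mysql_conn = MySQLdb.connect(host='127.0.0.1', user='reader', passwd='qwerty', db='airline')
ch_client = Client('127.0.0.1')

for _ in range(10):
    cursor = mysql_conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM ontime")
    mysql_count = cursor.fetchone()[0]
    ch_count = ch_client.execute("SELECT count() FROM airline.ontime")[0][0]
    print("mysql={} clickhouse={}".format(mysql_count, ch_count))
    time.sleep(5)

mysql_conn.close()
```
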
For example, via [clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh](clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh) script - -```mysql -mysql> SELECT COUNT(*) FROM airline.ontime; -+----------+ -| count(*) | -+----------+ -| 10259952 | -+----------+ -``` - -Replication will be pumping data from MySQL into ClickHouse in background and in some time we'll see the following picture in ClickHouse: -```mysql -:) select count(*) from airline.ontime; - -SELECT count(*) -FROM airline.ontime - -┌──count()─┐ -│ 10259952 │ -└──────────┘ -``` - -## MySQL Migration Case 2 - without Tables Lock -Suppose we'd like to migrate multiple log tables of the same structure named as `log_XXX` - i.e. all of them have `log_` name prefix -into one ClickHouse table named `logunified` of the following structure -```sql -DESCRIBE TABLE logunified - -┌─name─┬─type───┬─default_type─┬─default_expression─┐ -│ id │ UInt64 │ │ │ -│ day │ Date │ │ │ -│ str │ String │ │ │ -└──────┴────────┴──────────────┴────────────────────┘ -``` -Log tables by nature are `INSERT`-only tables. Let's migrate these tables. - -### MySQL Migration Case 2 - Create ClickHouse Table -Prepare tables templates in `create_clickhouse.sql` file -```bash -clickhouse-mysql \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=qwerty \ - --create-table-sql-template \ - --with-create-database \ - --src-tables-prefixes=db.log_ > create_clickhouse_table_template.sql -``` -Edit templates -```bash -vim create_clickhouse_table_template.sql -``` -And create tables in ClickHouse -```bash - -clickhouse-client -mn < create_clickhouse_table_template.sql -``` - -### MySQL Migration Case 2 - Listen For New Data -```bash -clickhouse-mysql \ - --src-server-id=1 \ - --src-resume \ - --src-wait \ - --nice-pause=1 \ - --log-level=info \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=qwerty \ - --src-tables-prefixes=log_ \ - --dst-host=127.0.0.1 \ - --dst-table=logunified \ - --csvpool \ - --pump-data -``` -Pay attention to -```bash - --src-tables-prefixes=log_ \ - --dst-table=logunified \ -``` -Replication data from multiple tables into one destination table `--dst-table=logunified`. - -Monitor logs for `first row in replication` notification of the following structure: -```bash -INFO:first row in replication db.log_201801_2 -column: id=1727834 -column: day=2018-01-20 -column: str=data event 3 -``` -These records help us to create SQL statement for Data Migration process. -Sure, we can peek into MySQL database manually in order to understand what records would be the last to be copied by migration process. 
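
These `first row in replication` records can also be turned into the WHERE-clause files used by the migration step in the next subsection. A helper like the one below is not shipped with the tool; it is only a sketch of the idea, with the file naming and the `id < N` condition following the example above:

```python
# Hypothetical helper: build per-table WHERE-clause files for
# --src-tables-where-clauses out of "first row in replication" log records.
import re

def build_where_clause_files(log_path):
    table = None
    with open(log_path) as log:
        for line in log:
            found = re.search(r'first row in replication (\S+)', line)
            if found:
                table = found.group(1)  # e.g. db.log_201801_2
                continue
            found = re.search(r'column: id=(\d+)', line)
            if found and table:
                # Existing-data migration should stop right before the first replicated row
                with open('{}.sql'.format(table), 'w') as out:
                    out.write('id < {}\n'.format(found.group(1)))
                table = None

build_where_clause_files('clickhouse-mysql.log')  # path to the reader's log file
```

Each generated file can then be listed as `db.table=db.table.sql` in `--src-tables-where-clauses`, as shown in the command below.
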
- -### MySQL Migration Case 2 - Migrate Existing Data - -```bash -clickhouse-mysql \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=qwerty \ - --migrate-table \ - --src-tables-prefixes=db.log_ \ - --src-tables-where-clauses=db.log_201801_1=db.log_201801_1.sql,db.log_201801_2=db.log_201801_2.sql,db.log_201801_3=db.log_201801_3.sql \ - --dst-host=127.0.0.1 \ - --dst-table=logunified \ - --csvpool -``` - -Pay attention to -```bash - --src-tables-prefixes=db.log_ \ - --src-tables-where-clauses=db.log_201801_1=db.log_201801_1.sql,db.log_201801_2=db.log_201801_2.sql,db.log_201801_3=db.log_201801_3.sql \ - --dst-table=logunified \ -``` -Migration subset of data described in `--src-tables-where-clauses` files from multiple tables into one destination table `--dst-table=logunified` - -Values for where clause in `db.log_201801_1.sql` are fetched from `first row in replication` log: `INFO:first row in replication db.log_201801_1` -```bash -cat db.log_201801_1.sql -id < 1727831 -``` - -Result: -```sql -:) select count(*) from logunified; - -SELECT count(*) -FROM logunified - -┌──count()─┐ -│ 12915568 │ -└──────────┘ - -``` - -## airline.ontime Test Case - -Main Steps - * Download airline.ontime dataset - * Create airline.ontime MySQL table - * Create airline.ontime ClickHouse table - * Start data reader (utility to migrate data MySQL -> ClickHouse) - * Start data importer (utility to import data into MySQL) - * Check how data are loaded into ClickHouse - -### airline.ontime Data Set in CSV files -Run [download script](clickhouse_mysql_examples/airline_ontime_data_download.sh) - -You may want to adjust dirs where to keep `ZIP` and `CSV` file - -In `airline_ontime_data_download.sh` edit these lines: -```bash -... -ZIP_FILES_DIR="zip" -CSV_FILES_DIR="csv" -... -``` -You may want to adjust number of files to download (In case downloading all it may take some time). - -Specify year and months range as you wish: -```bash -... -echo "Download files into $ZIP_FILES_DIR" -for year in `seq 1987 2017`; do - for month in `seq 1 12`; do -... -``` - -```bash -./airline_ontime_data_download.sh -``` -Downloading can take some time. - -### airline.ontime MySQL Table -Create MySQL table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](clickhouse_mysql_examples/airline_ontime_schema_mysql.sql): -```bash -mysql -uroot -p < clickhouse_mysql_examples/airline_ontime_schema_mysql.sql -``` - -### airline.ontime ClickHouse Table -Create ClickHouse table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_ch.sql](clickhouse_mysql_examples/airline_ontime_schema_ch.sql): -```bash -clickhouse-client -mn < clickhouse_mysql_examples/airline_ontime_schema_ch.sql -``` - -### airline.ontime Data Reader -Run [datareader script](clickhouse_mysql_examples/airline_ontime_data_mysql_to_ch_reader.sh) - -You may want to adjust `PYTHON` path and source and target hosts and usernames -```bash -... -PYTHON=python3.6 -PYTHON=/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy -... -``` -```bash -... - --src-host=127.0.0.1 \ - --src-user=root \ - --dst-host=127.0.0.1 \ -... -``` -```bash -./airline_ontime_data_mysql_to_ch_reader.sh -``` - -### airline.ontime Data Importer -Run [data importer script](clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh) - -You may want to adjust `CSV` files location, number of imported files and MySQL user/password used for import -```bash -... 
-# looking for csv files in this dir -FILES_TO_IMPORT_DIR="/mnt/nas/work/ontime" - -# limit import to this number of files -FILES_TO_IMPORT_NUM=3 -... -``` -```bash -... - -u root \ -... -``` - -```bash -./airline_ontime_mysql_data_import.sh -``` - -# Testing - -## Testing General Schema - -### MySQL Data Types - -#### Numeric Types - - * `BIT` the number of bits per value, from 1 to 64 - * `TINYINT` -128 to 127. The unsigned range is 0 to 255 - * `BOOL`, `BOOLEAN` synonyms for `TINYINT(1)` - * `SMALLINT` -32768 to 32767. The unsigned range is 0 to 65535 - * `MEDIUMINT` -8388608 to 8388607. The unsigned range is 0 to 16777215. - * `INT`, `INTEGER` -2147483648 to 2147483647. The unsigned range is 0 to 4294967295 - * `BIGINT` -9223372036854775808 to 9223372036854775807. The unsigned range is 0 to 18446744073709551615 - - * `SERIAL` is an alias for `BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE`. - * `DEC`, `DECIMAL`, `FIXED`, `NUMERIC` A packed ?exact? fixed-point number - * `FLOAT` Permissible values are -3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38 - * `DOUBLE`, `REAL` Permissible values are -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308 - - -#### Date and Time Types - - * `DATE` The supported range is '1000-01-01' to '9999-12-31' - * `DATETIME` The supported range is '1000-01-01 00:00:00.000000' to '9999-12-31 23:59:59.999999' - * `TIMESTAMP` The range is '1970-01-01 00:00:01.000000' UTC to '2038-01-19 03:14:07.999999' - * `TIME` The range is '-838:59:59.000000' to '838:59:59.000000' - * `YEAR` Values display as 1901 to 2155, and 0000 - -#### String Types - * `CHAR` The range of M is 0 to 255. If M is omitted, the length is 1. - * `VARCHAR` The range of M is 0 to 65,535 - * `BINARY` similar to CHAR - * `VARBINARY` similar to VARCHAR - * `TINYBLOB` maximum length of 255 - * `TINYTEXT` maximum length of 255 - * `BLOB` maximum length of 65,535 - * `TEXT` maximum length of 65,535 - * `MEDIUMBLOB` maximum length of 16,777,215 - * `MEDIUMTEXT` maximum length of 16,777,215 - * `LONGBLOB` maximum length of 4,294,967,295 or 4GB - * `LONGTEXT` maximum length of 4,294,967,295 or 4GB - * `ENUM` can have a maximum of 65,535 distinct elements - * `SET` can have a maximum of 64 distinct members - - * `JSON` native JSON data type defined by RFC 7159 - ---- - -### ClickHouse Data Types - - * `Date` number of days since 1970-01-01 - * `DateTime` Unix timestamp - * `Enum8` or `Enum16`. A set of enumerated string values that are stored as `Int8` or `Int16`. The numeric values must be within -128..127 for Enum8 and -32768..32767 for Enum16 - * `Float32`, `Float64` - - * `Int8` -128 127 - * `UInt8` 0 255 - - * `Int16` -32768 32767 - * `UInt16` 0 65535 - - * `Int32` -2147483648 2147483647 - * `UInt32` 0 4294967295 - - * `Int64` -9223372036854775808 9223372036854775807 - * `UInt64` 0 18446744073709551615 - - * `FixedString(N)` string of `N` bytes (not characters or code points) - * `String` The length is not limited. The value can contain an arbitrary set of bytes, including null bytes - ---- - -### MySQL -> ClickHouse Data Types Mapping - -#### Numeric Types - - * `BIT` -> ??? (possibly `String`?) - * `TINYINT` -> `Int8`, `UInt8` - * `BOOL`, `BOOLEAN` -> `UInt8` - * `SMALLINT` -> `Int16`, `UInt16` - * `MEDIUMINT` -> `Int32`, `UInt32` - * `INT`, `INTEGER` -> `Int32`, `UInt32` - * `BIGINT` -> `Int64`, `UInt64` - - * `SERIAL` -> `UInt64` - * `DEC`, `DECIMAL`, `FIXED`, `NUMERIC` -> ???? (possibly `String`?) 
- * `FLOAT` -> `Float32` - * `DOUBLE`, `REAL` -> `Float64` - - -#### Date and Time Types - - * `DATE` -> `Date` (for valid values) or `String` - `Date` Allows storing values from just after the beginning of the Unix Epoch - to the upper threshold defined by a constant at the compilation stage - (currently, this is until the year 2038, but it may be expanded to 2106) - * `DATETIME` -> `DateTime` (for valid values) or `String` - * `TIMESTAMP` -> `DateTime` - * `TIME` -> ????? (possibly `String`?) - * `YEAR` -> `UInt16` - - -#### String Types - - * `CHAR` -> `FixedString` - * `VARCHAR` -> `String` - * `BINARY` -> `String` - * `VARBINARY` -> `String` - * `TINYBLOB` -> `String` - * `TINYTEXT` -> `String` - * `BLOB` -> `String` - * `TEXT` -> `String` - * `MEDIUMBLOB` -> `String` - * `MEDIUMTEXT` -> `String` - * `LONGBLOB` -> `String` - * `LONGTEXT` -> `String` - -#### Set Types - * `ENUM` -> `Enum8`, `Enum16` - * `SET` -> `Array(Int8)` - -#### Custom Types - * `JSON` -> ?????? (possibly `String`?) - - -### MySQL Test Tables - -We have to separate test table into several ones because of this error, produced by MySQL: -```text -ERROR 1118 (42000): Row size too large. The maximum row size for the used table type, not counting BLOBs, is 65535. This includes storage overhead, check the manual. You have to change some columns to TEXT or BLOBs -``` - -```mysql -CREATE TABLE datatypes( - - bit_1 BIT(1), - bit_2 BIT(64), - - tinyint_1 TINYINT COMMENT '-128 to 127', - u_tinyint_1 TINYINT UNSIGNED COMMENT '0 to 255', - - bool_1 BOOL, - bool_2 BOOLEAN, - - smallint_1 SMALLINT COMMENT '-32768 to 32767', - u_smallint_1 SMALLINT UNSIGNED COMMENT '0 to 65535', - - mediumint_1 MEDIUMINT COMMENT '-8388608 to 8388607', - u_mediumint_1 MEDIUMINT UNSIGNED COMMENT '0 to 16777215', - - int_1 INT COMMENT '-2147483648 to 2147483647', - u_int_1 INT UNSIGNED COMMENT '0 to 4294967295', - - integer_1 INTEGER COMMENT '-2147483648 to 2147483647', - u_integer_1 INTEGER UNSIGNED COMMENT '0 to 4294967295', - - bigint_1 BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', - u_bigint_1 BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', - - serial_1 SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 0 to 18446744073709551615', - - decimal_1 DECIMAL(3,2) COMMENT 'exact fixed-point number', - dec_1 DEC(3,2) COMMENT 'alias for DECIMAL', - fixed_1 FIXED(3,2) COMMENT 'alias for DECIMAL', - numeric_1 NUMERIC(3,2) COMMENT 'alias for DECIMAL', - - float_1 FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', - u_float_1 FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', - - double_1 DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_double_1 DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - real_1 REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_real_1 REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 
0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - date_1 DATE COMMENT '1000-01-01 to 9999-12-31', - datetime_1 DATETIME COMMENT '1000-01-01 00:00:00 to 9999-12-31 23:59:59', - timestamp_1 TIMESTAMP COMMENT '1970-01-01 00:00:01 UTC to 2038-01-19 03:14:07 UTC', - time_1 TIME COMMENT '-838:59:59 to 838:59:59', - year_1 YEAR COMMENT '1901 to 2155, and 0000', - - char_0 CHAR(0), - char_1 CHAR(1), - char_2 CHAR(255), - - varchar_0 VARCHAR(0), - varchar_1 VARCHAR(1), - - binary_0 BINARY(0) COMMENT 'similar to CHAR', - binary_1 BINARY(1) COMMENT 'similar to CHAR', - binary_2 BINARY(255) COMMENT 'similar to CHAR', - - varbinary_0 VARBINARY(0) COMMENT 'similar to VARCHAR', - varbinary_1 VARBINARY(1) COMMENT 'similar to VARCHAR', - - tinyblob_1 TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', - tinytext_1 TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', - - blob_1 BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', - text_1 TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', - - mediumblob_1 MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', - mediumtext_1 MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', - - longblob_1 LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', - longtext_1 LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) characters' -) -; - -CREATE TABLE enum_datatypes( - enum_1 ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements' -) -; - -CREATE TABLE set_datatypes( - set_1 SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT ' can have a maximum of 64 distinct members' -) -; - -CREATE TABLE json_datatypes( - json_1 JSON -) -; - -CREATE TABLE long_varchar_datatypes( - varchar_2 VARCHAR(65532) -) -; - -CREATE TABLE long_varbinary_datatypes( - varbinary_2 VARBINARY(65532) COMMENT 'similar to VARCHAR' -) -; -``` - - -```mysql --- in order to be able to set timestamp = '1970-01-01 00:00:01' -set time_zone='+00:00'; -``` - -Insert minimal acceptable values into the test table: - -```mysql --- MIN values -INSERT INTO datatypes SET - - bit_1 = 0b0, -- BIT(1), - bit_2 = 0b0, -- BIT(64), - - tinyint_1 = -128, -- TINYINT COMMENT '-128 to 127', - u_tinyint_1 = 0, -- TINYINT UNSIGNED COMMENT '0 to 255', - - bool_1 = FALSE, -- BOOL, - bool_2 = FALSE, -- BOOLEAN, - - smallint_1 = -32768, -- SMALLINT COMMENT '-32768 to 32767', - u_smallint_1 = 0, -- SMALLINT UNSIGNED COMMENT '0 to 65535', - - mediumint_1 = -8388608, -- MEDIUMINT COMMENT '-8388608 to 8388607', - u_mediumint_1 = 0, -- MEDIUMINT UNSIGNED COMMENT '0 to 16777215', - - int_1 = -2147483648, -- INT COMMENT '-2147483648 to 2147483647', - u_int_1 = 0, -- INT UNSIGNED COMMENT '0 to 4294967295', - - integer_1 = -2147483648, -- INTEGER COMMENT '-2147483648 to 2147483647', - u_integer_1 = 0, -- INTEGER UNSIGNED COMMENT '0 to 4294967295', - - bigint_1 = -9223372036854775808, -- BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', - u_bigint_1 = 0, -- BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', - - serial_1 = 0, -- SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 
0 to 18446744073709551615', - - decimal_1 = -9.99, -- DECIMAL(3,2) COMMENT 'exact fixed-point number', - dec_1 = -9.99, -- DEC(3,2) COMMENT 'alias for DECIMAL', - fixed_1 = -9.99, -- FIXED(3,2) COMMENT 'alias for DECIMAL', - numeric_1 = -9.99, -- NUMERIC(3,2) COMMENT 'alias for DECIMAL', - - float_1 = -3.402823466E+38, -- FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', - u_float_1 = 0, -- FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', - - double_1 = -1.7976931348623157E+308, -- DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_double_1 = 0, -- DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - real_1 = -1.7976931348623157E+308, -- REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_real_1 = 0, -- REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - date_1 = '1970-01-01', -- DATE COMMENT '1000-01-01 to 9999-12-31', - datetime_1 = '1970-01-01 00:00:00', -- DATETIME COMMENT '1000-01-01 00:00:00 to 9999-12-31 23:59:59', - timestamp_1 = '1970-01-01 00:00:01', -- TIMESTAMP COMMENT '1970-01-01 00:00:01 UTC to 2038-01-19 03:14:07 UTC', - time_1 = '-838:59:59', -- TIME COMMENT '-838:59:59 to 838:59:59', - year_1 = 1901, -- YEAR COMMENT '1901 to 2155, and 0000', - - char_0 = '', -- CHAR(0), - char_1 = '', -- CHAR(1), - char_2 = '', -- CHAR(255), - - varchar_0 = '', -- VARCHAR(0), - varchar_1 = '', -- VARCHAR(1), - - binary_0 = '', -- BINARY(0) COMMENT 'similar to CHAR', - binary_1 = '', -- BINARY(1) COMMENT 'similar to CHAR', - binary_2 = '', -- BINARY(255) COMMENT 'similar to CHAR', - - varbinary_0 = '', -- VARBINARY(0) COMMENT 'similar to VARCHAR', - varbinary_1 = '', -- VARBINARY(1) COMMENT 'similar to VARCHAR', - - tinyblob_1 = '', -- TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', - tinytext_1 = '', -- TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', - - blob_1 = '', -- BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', - text_1 = '', -- TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', - - mediumblob_1 = '', -- MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', - mediumtext_1 = '', -- MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', - - longblob_1 = '', -- LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', - longtext_1 = '' -- LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 
1) characters' -; - -INSERT INTO enum_datatypes SET - enum_1 = NULL -- ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements' -; - -INSERT INTO set_datatypes SET - set_1 = '' -- SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 64 distinct members' -; - -INSERT INTO json_datatypes SET - json_1 = '{}' -- JSON -; - -INSERT INTO long_varchar_datatypes SET - varchar_2 = "" -; - -INSERT INTO long_varbinary_datatypes SET - varbinary_2 = "" -; -``` - -Insert maximum acceptable values into the test table: - -```mysql --- MAX values -INSERT INTO datatypes SET - - bit_1 = 0b1, -- BIT(1), - bit_2 = 0b1111111111111111111111111111111111111111111111111111111111111111, -- BIT(64), - - tinyint_1 = 127, -- TINYINT COMMENT '-128 to 127', - u_tinyint_1 = 255, -- TINYINT UNSIGNED COMMENT '0 to 255', - - bool_1 = TRUE, -- BOOL, - bool_2 = TRUE, -- BOOLEAN, - - smallint_1 = 32767, -- SMALLINT COMMENT '-32768 to 32767', - u_smallint_1 = 65535, -- SMALLINT UNSIGNED COMMENT '0 to 65535', - - mediumint_1 = 8388607, -- MEDIUMINT COMMENT '-8388608 to 8388607', - u_mediumint_1 = 16777215, -- MEDIUMINT UNSIGNED COMMENT '0 to 16777215', - - int_1 = 2147483647, -- INT COMMENT '-2147483648 to 2147483647', - u_int_1 = 4294967295, -- INT UNSIGNED COMMENT '0 to 4294967295', - - integer_1 = 2147483647, -- INTEGER COMMENT '-2147483648 to 2147483647', - u_integer_1 = 4294967295, -- INTEGER UNSIGNED COMMENT '0 to 4294967295', - - bigint_1 = 9223372036854775807, -- BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', - u_bigint_1 = 18446744073709551615, -- BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', - - serial_1 = 18446744073709551615, -- SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 0 to 18446744073709551615', - - decimal_1 = 9.99, -- DECIMAL(3,2) COMMENT 'exact fixed-point number', - dec_1 = 9.99, -- DEC(3,2) COMMENT 'alias for DECIMAL', - fixed_1 = 9.99, -- FIXED(3,2) COMMENT 'alias for DECIMAL', - numeric_1 = 9.99, -- NUMERIC(3,2) COMMENT 'alias for DECIMAL', - - float_1 = 3.402823466E+38, -- FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', - u_float_1 = 3.402823466E+38, -- FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', - - double_1 = 1.7976931348623157E+308, -- DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_double_1 = 1.7976931348623157E+308, -- DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - real_1 = 1.7976931348623157E+308, -- REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_real_1 = 1.7976931348623157E+308, -- REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 
0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - date_1 = '2149-06-01', -- DATE COMMENT '1000-01-01 to 9999-12-31', - datetime_1 = '2106-02-01 23:59:59', -- DATETIME COMMENT '1000-01-01 00:00:00 to 9999-12-31 23:59:59', - timestamp_1 = '2038-01-19 03:14:07', -- TIMESTAMP COMMENT '1970-01-01 00:00:01 UTC to 2038-01-19 03:14:07 UTC', - time_1 = '838:59:59', -- TIME COMMENT '-838:59:59 to 838:59:59', - year_1 = 2155, -- YEAR COMMENT '1901 to 2155, and 0000', - - char_0 = '', -- CHAR(0), - char_1 = 'a', -- CHAR(1), - char_2 = 'abcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcde', -- CHAR(255), - - varchar_0 = '', -- VARCHAR(0), - varchar_1 = 'a', -- VARCHAR(1), - - binary_0 = '', -- BINARY(0) COMMENT 'similar to CHAR', - binary_1 = 'a', -- BINARY(1) COMMENT 'similar to CHAR', - binary_2 = 'abcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcde', -- BINARY(255) COMMENT 'similar to CHAR', - - varbinary_0 = '', -- VARBINARY(0) COMMENT 'similar to VARCHAR', - varbinary_1 = 'a', -- VARBINARY(1) COMMENT 'similar to VARCHAR', - - tinyblob_1 = 'a', -- TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', - tinytext_1 = 'a', -- TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', - - blob_1 = 'a', -- BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', - text_1 = 'a', -- TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', - - mediumblob_1 = 'a', -- MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', - mediumtext_1 = 'a', -- MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', - - longblob_1 = 'a', -- LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', - longtext_1 = 'a' -- LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 
1) characters' -; - -INSERT INTO enum_datatypes SET - enum_1 = 'a' -- ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements' -; - -INSERT INTO set_datatypes SET - set_1 = 'a,b,c' -- SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 64 distinct members', -; - -INSERT INTO json_datatypes SET - json_1 = '{"a":1, "b":2, "c":3}' -- JSON -; - -INSERT INTO long_varchar_datatypes SET - varchar_2 = "abc" -; - -INSERT INTO long_varbinary_datatypes SET - varbinary_2 = "abc" -; -``` - -### ClickHouse Test Tables - -```sql -CREATE TABLE datatypes( - bit_1 Nullable(String), -- bit_1 BIT(1), - bit_2 Nullable(String), -- bit_2 BIT(64), - - tinyint_1 Nullable(Int8), -- tinyint_1 TINYINT COMMENT '-128 to 127', - u_tinyint_1 Nullable(UInt8), -- u_tinyint_1 TINYINT UNSIGNED COMMENT '0 to 255', - - bool_1 Nullable(UInt8), -- bool_1 BOOL, - bool_2 Nullable(UInt8), -- bool_2 BOOLEAN, - - smallint_1 Nullable(Int16), -- smallint_1 SMALLINT COMMENT '-32768 to 32767', - u_smallint_1 Nullable(UInt16), -- u_smallint_1 SMALLINT UNSIGNED COMMENT '0 to 65535', - - mediumint_1 Nullable(Int32), -- mediumint_1 MEDIUMINT COMMENT '-8388608 to 8388607', - u_mediumint_1 Nullable(UInt32), -- u_mediumint_1 MEDIUMINT UNSIGNED COMMENT '0 to 16777215', - - int_1 Nullable(Int32), -- int_1 INT COMMENT '-2147483648 to 2147483647', - u_int_1 Nullable(UInt32), -- u_int_1 INT UNSIGNED COMMENT '0 to 4294967295', - - integer_1 Nullable(Int32), -- integer_1 INTEGER COMMENT '-2147483648 to 2147483647', - u_integer_1 Nullable(UInt32), -- u_integer_1 INTEGER UNSIGNED COMMENT '0 to 4294967295', - - bigint_1 Nullable(Int64), -- bigint_1 BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', - u_bigint_1 Nullable(UInt64), -- u_bigint_1 BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', - - serial_1 Nullable(UInt64), -- serial_1 SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 0 to 18446744073709551615', - - decimal_1 Nullable(String), -- decimal_1 DECIMAL(3,2) COMMENT 'exact fixed-point number', - dec_1 Nullable(String), -- dec_1 DEC(3,2) COMMENT 'alias for DECIMAL', - fixed_1 Nullable(String), -- fixed_1 FIXED(3,2) COMMENT 'alias for DECIMAL', - numeric_1 Nullable(String), -- numeric_1 NUMERIC(3,2) COMMENT 'alias for DECIMAL', - - float_1 Nullable(Float32), -- float_1 FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', - u_float_1 Nullable(Float32), -- u_float_1 FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', - - double_1 Nullable(Float64), -- double_1 DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_double_1 Nullable(Float64), -- u_double_1 DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - real_1 Nullable(Float64), -- real_1 REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_real_1 Nullable(Float64), -- u_real_1 REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 
0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - date_1 Nullable(Date), -- date_1 DATE COMMENT '1000-01-01 to 9999-12-31', - datetime_1 Nullable(DateTime), -- datetime_1 DATETIME COMMENT '1000-01-01 00:00:00.000000 to 9999-12-31 23:59:59.999999', - timestamp_1 Nullable(DateTime), -- timestamp_1 TIMESTAMP COMMENT '1970-01-01 00:00:01.000000 UTC to 2038-01-19 03:14:07.999999 UTC', - time_1 Nullable(String), -- time_1 TIME COMMENT '-838:59:59.000000 to 838:59:59.000000', - year_1 Nullable(UInt16), -- year_1 YEAR COMMENT '1901 to 2155, and 0000', - - char_0 Nullable(FixedString(1)), -- char_0 CHAR(0), - char_1 Nullable(FixedString(1)), -- char_1 CHAR(1), - char_2 Nullable(FixedString(255)), -- char_2 CHAR(255), - - varchar_0 Nullable(String), -- varchar_0 VARCHAR(0), - varchar_1 Nullable(String), -- varchar_1 VARCHAR(1), - - binary_0 Nullable(String), -- binary_0 BINARY(0) COMMENT 'similar to CHAR', - binary_1 Nullable(String), -- binary_1 BINARY(1) COMMENT 'similar to CHAR', - binary_2 Nullable(String), -- binary_2 BINARY(255) COMMENT 'similar to CHAR', - - varbinary_0 Nullable(String), -- varbinary_0 VARBINARY(0) COMMENT 'similar to VARCHAR', - varbinary_1 Nullable(String), -- varbinary_1 VARBINARY(1) COMMENT 'similar to VARCHAR', - - tinyblob_1 Nullable(String), -- tinyblob_1 TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', - tinytext_1 Nullable(String), -- tinytext_1 TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', - - blob_1 Nullable(String), -- blob_1 BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', - text_1 Nullable(String), -- text_1 TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', - - mediumblob_1 Nullable(String), -- mediumblob_1 MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', - mediumtext_1 Nullable(String), -- mediumtext_1 MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', - - longblob_1 Nullable(String), -- longblob_1 LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', - longtext_1 Nullable(String) -- longtext_1 LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 
1) characters', - -) ENGINE = Log -; - -CREATE TABLE enum_datatypes( - enum_1 Enum16('a'=1, 'b'=2, 'c'=3, 'd'=4, 'e'=5, 'f'=6) -- enum_1 ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements', -) ENGINE = Memory -; - -CREATE TABLE set_datatypes( - set_1 Array(Enum16('a'=1, 'b'=2, 'c'=3, 'd'=4, 'e'=5, 'f'=6)) -- set_1 SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT ' can have a maximum of 64 distinct members', -) ENGINE = Memory -; - -CREATE TABLE set_datatypes( - set_1 String -- set_1 SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT ' can have a maximum of 64 distinct members', -) ENGINE = Memory -; - - -CREATE TABLE json_datatypes( - json_1 String -- json_1 JSON -) ENGINE = Memory -; - -CREATE TABLE long_varchar_datatypes( - varchar_2 String -) ENGINE = Memory -; - -CREATE TABLE long_varbinary_datatypes( - varbinary_2 String -) ENGINE = Memory -; -``` +- [Manual](docs/manual.md) +- [Usage examples](docs/usage-references.md) diff --git a/clickhouse_mysql/clioptions.py b/clickhouse_mysql/clioptions.py index 87823c3..c46897e 100644 --- a/clickhouse_mysql/clioptions.py +++ b/clickhouse_mysql/clioptions.py @@ -16,25 +16,25 @@ def join_lists_into_dict(lists_to_join): [['a=b', 'c=d'], ['e=f', 'z=x'], ] :return: None or dictionary - {'a': 'b', 'c': 'd', 'e': 'f', 'z': 'x'} + {'a': 'b', 'c': 'd', 'e': 'f', 'y': 'z'} """ + # lists_to_join must be a list if not isinstance(lists_to_join, list): return None res = {} - for lst in lists_to_join: - # lst = ['a=b', 'c=d'] - for column_value_pair in lst: - # column_value_value = 'a=b' - column, value = column_value_pair.split('=', 2) - res[column] = value - # res = dict { - # 'col1': 'value1', - # 'col2': 'value2', + # 'name1': 'value1', + # 'name2': 'value2', # } + for _list in lists_to_join: + # _list = ['a=b', 'c=d'] + for name_value_pair in _list: + # name_value_pair contains 'a=b' + name, value = name_value_pair.split('=', 2) + res[name] = value # return with sanity check if len(res) > 0: @@ -52,6 +52,7 @@ def join_lists(lists_to_join): ['a', 'b', 'c', 'd', 'e', 'f'] """ + # lists_to_join must be a list if not isinstance(lists_to_join, list): return None @@ -92,6 +93,10 @@ class CLIOptions(Options): # # general app section # + + 'tb_host': 'https://ui.tinybird.co', + 'tb_token': None, + 'config_file': '/etc/clickhouse-mysql/clickhouse-mysql.conf', 'log_file': None, 'log_level': None, @@ -142,14 +147,17 @@ class CLIOptions(Options): 'dst_user': 'default', 'dst_password': '', 'dst_schema': None, + 'dst_distribute': False, + 'dst_cluster': None, 'dst_table': None, + 'dst_table_prefix': None, 'dst_create_table': False, # # converters section # 'column_default_value': None, - 'column_skip': None, + 'column_skip': [], 'ch_converter_file': None, 'ch_converter_class': None, } @@ -167,6 +175,20 @@ def options(self): # # general app section # + argparser.add_argument( + '--tb-host', + type=str, + default=self.default_options['tb_host'], + help='Tinybird host' + ) + + argparser.add_argument( + '--tb-token', + type=str, + default=self.default_options['tb_token'], + help='Tinybird host' + ) + argparser.add_argument( '--config-file', type=str, @@ -189,13 +211,13 @@ def options(self): '--nice-pause', type=int, default=self.default_options['nice_pause'], - help='make nice pause between attempts to read binlog stream' + help='Make specified (in sec) pause between attempts to read binlog stream' ) argparser.add_argument( '--dry', action='store_true', help='Dry mode - do not do anything that can harm. ' - 'Useful for debugging.' 
+ 'Useful for debugging.' ) argparser.add_argument( '--daemon', @@ -206,13 +228,13 @@ def options(self): '--pid-file', type=str, default=self.default_options['pid_file'], - help='Pid file to be used by app in daemon mode' + help='Pid file to be used by the app in daemon mode' ) argparser.add_argument( '--binlog-position-file', type=str, default=self.default_options['binlog_position_file'], - help='File to write binlog position to' + help='File to write binlog position to during bin log reading and to read position from on start' ) argparser.add_argument( '--mempool', @@ -240,7 +262,8 @@ def options(self): argparser.add_argument( '--csvpool', action='store_true', - help='Cache data in CSV pool files on disk. Requires memory pooling, thus enables --mempool even if it is not explicitly specified' + help='Cache data in CSV pool files on disk. Requires memory pooling, ' + 'thus enables --mempool even if it is not explicitly specified' ) argparser.add_argument( '--csvpool-file-path-prefix', @@ -276,14 +299,19 @@ def options(self): argparser.add_argument( '--migrate-table', action='store_true', - help='Migrate table(s). IMPORTANT!. Target table has to be created in ClickHouse ' - 'or it has to be created with --create-table and possibly with --with-create-database options' - 'See --table-template and --table-create options for additional info.' + help='Migrate table(s). Copy existing data from MySQL table(s) with SELECT statement. ' + 'Binlog is not read during this procedure - just copy data from the src table(s). ' + 'IMPORTANT!. Target table has to be created in ClickHouse ' + 'or it has to be created with --dst-create-table and possibly with --with-create-database options. ' + 'See --create-table-sql-template and --create-table-sql options for additional info. ' ) argparser.add_argument( '--pump-data', action='store_true', - help='Pump data into ClickHouse' + help='Pump data from MySQL binlog into ClickHouse. Copy rows from binlog until the end of binlog reached. ' + 'When end of binlog reached, process ends. ' + 'Use in combination with --src-wait in case would like to continue and wait for new rows ' + 'after end of binlog reached' ) argparser.add_argument( '--install', @@ -328,19 +356,25 @@ def options(self): '--src-schemas', type=str, default=self.default_options['src_schemas'], - help='Comma-separated list of schemas to be used when reading from src. Ex.: db1,db2,db3' + help='Comma-separated list of databases (a.k.a schemas) to be used when reading from src. Ex.: db1,db2,db3' ) argparser.add_argument( '--src-tables', type=str, default=self.default_options['src_tables'], - help='Comma-separated list of tables to be used when reading from src. Ex.: table1,table2,table3' + help='Comma-separated list of tables to be used when reading from src. ' + 'Ex.: table1,table2,table3' + 'Ex.: db1.table1,db2.table2,db3.table3' + 'Ex.: table1,db2.table2,table3' ) argparser.add_argument( '--src-tables-where-clauses', type=str, default=self.default_options['src_tables_where_clauses'], - help='Comma-separated list of WHERE clauses for tables to be migrated. Ex.: db1.t1="a=1 and b=2",db2.t2="c=3 and k=4"' + help='Comma-separated list of WHERE clauses for tables to be migrated. ' + 'Ex.: db1.t1="a=1 and b=2",db2.t2="c=3 and k=4". 
' + 'Accepts both (comma-separated) clause (useful for short clauses) or ' + 'file where clause is located (useful for long clauses)' ) argparser.add_argument( '--src-tables-prefixes', @@ -358,19 +392,21 @@ def options(self): argparser.add_argument( '--src-resume', action='store_true', - help='Resume reading from previous position.' + help='Resume reading from previous position. Previous position is read from `binlog-position-file`' ) argparser.add_argument( '--src-binlog-file', type=str, default=self.default_options['src_binlog_file'], - help='Binlog file to be used when reading from src. Ex.: mysql-bin.000024' + help='Binlog file to be used to read from src. Related to `binlog-position-file`. ' + 'Ex.: mysql-bin.000024' ) argparser.add_argument( '--src-binlog-position', type=int, default=self.default_options['src_binlog_position'], - help='Binlog position to be used when reading from src. Ex.: 5703' + help='Binlog position to be used when reading from src. Related to `binlog-position-file`. ' + 'Ex.: 5703' ) argparser.add_argument( '--src-file', @@ -416,7 +452,22 @@ def options(self): '--dst-schema', type=str, default=self.default_options['dst_schema'], - help='Database/schema to be used when writing to dst. Ex.: db1' + help='Database (a.k.a schema) to be used to create tables in ClickHouse. ' + 'It overwrites source database(s) name(s), so tables in ClickHouse ' + 'would be located in differently named db than in MySQL. ' + 'Ex.: db1' + ) + argparser.add_argument( + '--dst-distribute', + action='store_true', + default=self.default_options['dst_distribute'], + help='Whether to add distribute table' + ) + argparser.add_argument( + '--dst-cluster', + type=str, + default=self.default_options['dst_cluster'], + help='Cluster to be used when writing to dst. Ex.: cluster1' ) argparser.add_argument( '--dst-table', @@ -424,6 +475,12 @@ def options(self): default=self.default_options['dst_table'], help='Table to be used when writing to dst. Ex.: table1' ) + argparser.add_argument( + '--dst-table-prefix', + type=str, + default=self.default_options['dst_table_prefix'], + help='Prefix to be used when creating dst table. Ex.: copy_table_' + ) argparser.add_argument( '--dst-create-table', action='store_true', @@ -439,7 +496,8 @@ def options(self): nargs='*', action='append', default=self.default_options['column_default_value'], - help='Set of key=value pairs for columns default values. Ex.: date_1=2000-01-01 timestamp_1=2002-01-01\ 01:02:03' + help='Set of key=value pairs for columns default values. 
' + 'Ex.: date_1=2000-01-01 timestamp_1=2002-01-01\ 01:02:03' ) argparser.add_argument( '--column-skip', @@ -468,6 +526,11 @@ def options(self): # # general app section # + + 'tb_host': args.tb_host, + 'tb_token': args.tb_token, + + 'config_file': args.config_file, 'log_file': args.log_file, 'log_level': args.log_level, @@ -518,7 +581,10 @@ def options(self): 'dst_user': args.dst_user, 'dst_password': args.dst_password, 'dst_schema': args.dst_schema, + 'dst_distribute': args.dst_distribute, + 'dst_cluster': args.dst_cluster, 'dst_table': args.dst_table, + 'dst_table_prefix': args.dst_table_prefix, 'dst_create_table': args.dst_create_table, # @@ -541,8 +607,8 @@ def options(filename): # def transform(section, key): - newkey = key.replace('-', '_') - section.rename(key, newkey) + new_key = key.replace('-', '_') + section.rename(key, new_key) # fetch base config try: @@ -551,7 +617,7 @@ def transform(section, key): encoding="utf-8", default_encoding="utf-8", list_values=True, - create_empty=False, # create empty config file + create_empty=False, # create empty config file stringify=True, raise_errors=False, file_error=False, @@ -566,7 +632,7 @@ def transform(section, key): encoding="utf-8", default_encoding="utf-8", list_values=True, - create_empty=False, # create empty config file + create_empty=False, # create empty config file stringify=True, raise_errors=False, file_error=False, diff --git a/clickhouse_mysql/config.py b/clickhouse_mysql/config.py index e2041e5..217aa0b 100644 --- a/clickhouse_mysql/config.py +++ b/clickhouse_mysql/config.py @@ -1,12 +1,14 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import logging +import os from clickhouse_mysql.reader.mysqlreader import MySQLReader from clickhouse_mysql.reader.csvreader import CSVReader from clickhouse_mysql.writer.chwriter import CHWriter from clickhouse_mysql.writer.csvwriter import CSVWriter -from clickhouse_mysql.writer.chcsvwriter import CHCSVWriter +from clickhouse_mysql.writer.tbcsvwriter import TBCSVWriter from clickhouse_mysql.writer.poolwriter import PoolWriter from clickhouse_mysql.writer.processwriter import ProcessWriter from clickhouse_mysql.objectbuilder import ObjectBuilder @@ -39,7 +41,7 @@ def __init__(self): log_file = None log_pos = None - if self.options['binlog_position_file'] and self.options.get_bool('src_resume'): + if self.options['binlog_position_file'] and self.options.get_bool('src_resume') and os.path.exists(self.options['binlog_position_file']): try: with open(self.options['binlog_position_file'], 'r') as f: position = f.read() @@ -50,18 +52,22 @@ def __init__(self): log_file, log_pos )) - except: + except Exception as e: log_file = None log_pos = None - print("can't read binlog position from file {}".format( + logging.exception(e) + logging.info("can't read binlog position from file {}".format( self.options['binlog_position_file'], )) - # build application config out of aggregated options self.config = { # # # + 'tinybird': { + 'host': self.options['tb_host'], + 'token': self.options['tb_token'], + }, 'app': { 'config_file': self.options['config_file'], 'log_file': self.options['log_file'], @@ -108,9 +114,10 @@ def __init__(self): 'port': self.options.get_int('src_port'), 'user': self.options['src_user'], 'password': self.options['src_password'], - 'dbs': self.options.get_list('src_schemas'), + 'schemas': self.options.get_list('src_schemas'), 'tables': self.options.get_list('src_tables'), 'tables_prefixes': self.options.get_list('src_tables_prefixes'), + 'column_skip': self.options['column_skip'] }, 
'clickhouse': { 'connection_settings': { @@ -119,6 +126,11 @@ def __init__(self): 'user': self.options['dst_user'], 'password': self.options['dst_password'], }, + 'dst_schema': self.options['dst_schema'], + 'dst_distribute': self.options['dst_distribute'], + 'dst_cluster': self.options['dst_cluster'], + 'dst_table': self.options['dst_table'], + 'dst_table_prefix': self.options['dst_table_prefix'], 'dst_create_table': self.options.get_bool('dst_create_table'), }, }, @@ -132,10 +144,11 @@ def __init__(self): 'port': self.options.get_int('src_port'), 'user': self.options['src_user'], 'password': self.options['src_password'], - 'dbs': self.options.get_list('src_schemas'), + 'schemas': self.options.get_list('src_schemas'), 'tables': self.options.get_list('src_tables'), 'tables_prefixes': self.options.get_list('src_tables_prefixes'), 'tables_where_clauses': self.options.get_list('src_tables_where_clauses'), + 'column_skip': self.options['column_skip'] }, 'clickhouse': { 'connection_settings': { @@ -145,7 +158,10 @@ def __init__(self): 'password': self.options['dst_password'], }, 'dst_schema': self.options['dst_schema'], + 'dst_distribute': self.options['dst_distribute'], + 'dst_cluster': self.options['dst_cluster'], 'dst_table': self.options['dst_table'], + 'dst_table_prefix': self.options['dst_table_prefix'], 'dst_create_table': self.options.get_bool('dst_create_table'), }, }, @@ -189,7 +205,9 @@ def __init__(self): 'password': self.options['dst_password'], }, 'dst_schema': self.options['dst_schema'], + 'dst_distribute': self.options['dst_distribute'], 'dst_table': self.options['dst_table'], + 'dst_table_prefix': self.options['dst_table_prefix'], }, 'file': { 'csv_file_path': self.options['dst_file'], @@ -197,7 +215,9 @@ def __init__(self): 'csv_file_path_suffix_parts': [], 'csv_keep_file': self.options['csvpool_keep_files'], 'dst_schema': self.options['dst_schema'], + 'dst_distribute': self.options['dst_distribute'], 'dst_table': self.options['dst_table'], + 'dst_table_prefix': self.options['dst_table_prefix'], }, }, } @@ -247,9 +267,15 @@ def table_sql_builder(self): port=self.config['table_builder']['mysql']['port'], user=self.config['table_builder']['mysql']['user'], password=self.config['table_builder']['mysql']['password'], - dbs=self.config['table_builder']['mysql']['dbs'], + dbs=self.config['table_builder']['mysql']['schemas'], + dst_schema=self.config['table_builder']['clickhouse']['dst_schema'], + dst_table=self.config['table_builder']['clickhouse']['dst_table'], + dst_table_prefix=self.config['table_builder']['clickhouse']['dst_table_prefix'], + distribute=self.config['table_builder']['clickhouse']['dst_distribute'], + cluster=self.config['table_builder']['clickhouse']['dst_cluster'], tables=self.config['table_builder']['mysql']['tables'], tables_prefixes=self.config['table_builder']['mysql']['tables_prefixes'], + column_skip=self.config['converter']['clickhouse']['column_skip'], ) def is_migrate_table(self): @@ -267,10 +293,16 @@ def table_migrator(self): port=self.config['table_migrator']['mysql']['port'], user=self.config['table_migrator']['mysql']['user'], password=self.config['table_migrator']['mysql']['password'], - dbs=self.config['table_migrator']['mysql']['dbs'], + dbs=self.config['table_migrator']['mysql']['schemas'], + dst_schema=self.config['table_migrator']['clickhouse']['dst_schema'], + dst_table=self.config['table_builder']['clickhouse']['dst_table'], + dst_table_prefix=self.config['table_builder']['clickhouse']['dst_table_prefix'], + 
distribute=self.config['table_migrator']['clickhouse']['dst_distribute'], + cluster=self.config['table_migrator']['clickhouse']['dst_cluster'], tables=self.config['table_migrator']['mysql']['tables'], tables_prefixes=self.config['table_migrator']['mysql']['tables_prefixes'], tables_where_clauses=self.config['table_migrator']['mysql']['tables_where_clauses'], + column_skip=self.config['converter']['clickhouse']['column_skip'], ) table_migrator.chwriter = self.writer_builder_chwriter().get() table_migrator.chclient = self.chclient() @@ -332,9 +364,14 @@ def writer_builder_csvpool(self): 'csv_keep_file': self.config['writer']['file']['csv_keep_file'], 'dst_schema': self.config['writer']['file']['dst_schema'], 'dst_table': self.config['writer']['file']['dst_table'], + 'dst_table_prefix': self.config['writer']['file']['dst_table_prefix'], 'next_writer_builder': ObjectBuilder( - class_name=CHCSVWriter, - constructor_params=self.config['writer']['clickhouse'] + class_name=TBCSVWriter, + constructor_params={ + 'tb_host': self.config['tinybird']['host'], + 'tb_token': self.config['tinybird']['token'], + 'dst_table': self.config['writer']['clickhouse']['dst_table'] + } ), 'converter_builder': self.converter_builder(CONVERTER_CSV), }) @@ -348,6 +385,7 @@ def writer_builder_csv_file(self): 'csv_keep_file': self.config['writer']['file']['csv_keep_file'], 'dst_schema': self.config['writer']['file']['dst_schema'], 'dst_table': self.config['writer']['file']['dst_table'], + 'dst_table_prefix': self.config['writer']['file']['dst_table_prefix'], 'next_writer_builder': None, 'converter_builder': self.converter_builder(CONVERTER_CSV), }) @@ -362,6 +400,8 @@ def writer_builder_chwriter(self): }, 'dst_schema': self.config['writer']['clickhouse']['dst_schema'], 'dst_table': self.config['writer']['clickhouse']['dst_table'], + 'dst_table_prefix': self.config['writer']['clickhouse']['dst_table_prefix'], + 'dst_distribute': self.config['writer']['clickhouse']['dst_distribute'], 'next_writer_builder': None, 'converter_builder': self.converter_builder(CONVERTER_CH), }) diff --git a/clickhouse_mysql/dbclient/mysqlclient.py b/clickhouse_mysql/dbclient/mysqlclient.py index aac15ce..0073bfe 100644 --- a/clickhouse_mysql/dbclient/mysqlclient.py +++ b/clickhouse_mysql/dbclient/mysqlclient.py @@ -58,21 +58,26 @@ def connect(self, db): try: self.connection = MySQLdb.connect( host=self.host, + port=self.port, user=self.user, passwd=self.password, db=db, cursorclass=self.cursorclass, + charset='utf8', + use_unicode=True, ) self.cursor = self.connection.cursor() - logging.debug("Connect to the database host={} user={} password={} db={}".format( + logging.debug("Connect to the database host={} port={} user={} password={} db={}".format( self.host, + self.port, self.user, self.password, db )) except: - raise Exception("Can not connect to the database host={} user={} password={} db={}".format( + raise Exception("Can not connect to the database host={} port={} user={} password={} db={}".format( self.host, + self.port, self.user, self.password, db @@ -99,13 +104,14 @@ def tables_list(self, db): tables = [] for row in self.cursor: - logging.debug("table: {}".format(row)) - table_name = row['Tables_in_db'] + table_name = row[0] tables.append(table_name) - except: - raise Exception("Can not list tables on host={} user={} password={} db={}".format( + except Exception as err: + logging.debug("Unexpected error: {}".format(str(err))) + raise Exception("Can not list tables on host={} port={} user={} password={} db={}".format( self.host, + 
self.port, self.user, self.password, db diff --git a/clickhouse_mysql/event/event.py b/clickhouse_mysql/event/event.py index 836f3d2..e38f80b 100644 --- a/clickhouse_mysql/event/event.py +++ b/clickhouse_mysql/event/event.py @@ -28,6 +28,9 @@ class Event(object): # table name table = None + # primary key + primary_key = None + # /path/to/csv/file.csv filename = None @@ -61,7 +64,11 @@ def __next__(self): if self.pymysqlreplication_event is not None: # in native replication event actual data are in row['values'] dict item - return item['values'] + if 'after_values' in item: + return item['after_values'] + else: + return item['values'] + else: # local-kept data return item diff --git a/clickhouse_mysql/main.py b/clickhouse_mysql/main.py index bab32eb..d751573 100644 --- a/clickhouse_mysql/main.py +++ b/clickhouse_mysql/main.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import signal import sys import multiprocessing as mp import logging @@ -145,6 +146,10 @@ def run(self): reader=self.config.reader(), writer=self.config.writer(), ) + + signal.signal(signal.SIGINT, pumper.exit_gracefully) + signal.signal(signal.SIGTERM, pumper.exit_gracefully) + pumper.run() except Exception as ex: @@ -153,6 +158,7 @@ def run(self): traceback.print_exc(file=sys.stdout) print('=============') print(ex) + sys.exit(1); def start(self): if self.config.is_daemon(): diff --git a/clickhouse_mysql/pool/bbpool.py b/clickhouse_mysql/pool/bbpool.py index f15c268..c36265b 100644 --- a/clickhouse_mysql/pool/bbpool.py +++ b/clickhouse_mysql/pool/bbpool.py @@ -6,6 +6,7 @@ from clickhouse_mysql.pool.pool import Pool from clickhouse_mysql.objectbuilder import ObjectBuilder +from pymysqlreplication.row_event import WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent # Buckets Belts' Index Generator @@ -149,7 +150,18 @@ def rotate_belt(self, belt_index, flush=False): # time to flush data for specified key #self.writer_builder.param('csv_file_path_suffix_parts', [str(int(now)), str(self.buckets_num_total)]) writer = self.writer_builder.new() - writer.insert(self.belts[belt_index].pop()) + item = self.belts[belt_index].pop() + # process event based on its type + if isinstance(item[0].pymysqlreplication_event, WriteRowsEvent): + writer.insert(item) + elif isinstance(item[0].pymysqlreplication_event, DeleteRowsEvent): + writer.delete(item) + elif isinstance(item[0].pymysqlreplication_event, UpdateRowsEvent): + writer.update(item) + else: + # skip other unhandled events + pass + # writer.insert(self.belts[belt_index].pop()) writer.close() writer.push() writer.destroy() diff --git a/clickhouse_mysql/pumper.py b/clickhouse_mysql/pumper.py index 959da6e..245f9c2 100644 --- a/clickhouse_mysql/pumper.py +++ b/clickhouse_mysql/pumper.py @@ -1,18 +1,21 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import time + +from clickhouse_mysql.reader.reader import Reader +from clickhouse_mysql.writer.writer import Writer +import signal + class Pumper(object): """ Pump data - read data from reader and push into writer """ - reader = None - writer = None + reader: Reader = None + writer: Writer = None def __init__(self, reader=None, writer=None): - self.reader = reader self.writer = writer @@ -20,8 +23,10 @@ def __init__(self, reader=None, writer=None): # subscribe on reader's event notifications self.reader.subscribe({ 'WriteRowsEvent': self.write_rows_event, -# 'WriteRowsEvent.EachRow': self.write_rows_event_each_row, - 'ReaderIdleEvent': self.reader_idle_event, + 'UpdateRowsEvent': self.update_rows_event, + 'DeleteRowsEvent': 
self.delete_rows_event, + # 'WriteRowsEvent.EachRow': self.write_rows_event_each_row, + # 'ReaderIdleEvent': self.reader_idle_event, }) def run(self): @@ -47,5 +52,23 @@ def reader_idle_event(self): """ self.writer.flush() + def delete_rows_event(self, event=None): + """ + DeleteRowsEvent handler + :param event: + """ + self.writer.delete_row(event) + + def update_rows_event(self, event=None): + """ + UpdateRowsEvent handler + :param event: + """ + self.writer.update(event) + + def exit_gracefully(self, sig, frame): + self.reader.close() + + if __name__ == '__main__': print("pumper") diff --git a/clickhouse_mysql/reader/mysqlreader.py b/clickhouse_mysql/reader/mysqlreader.py index 96edbdd..5cb6c5c 100644 --- a/clickhouse_mysql/reader/mysqlreader.py +++ b/clickhouse_mysql/reader/mysqlreader.py @@ -12,7 +12,6 @@ from clickhouse_mysql.event.event import Event from clickhouse_mysql.tableprocessor import TableProcessor from clickhouse_mysql.util import Util -#from pymysqlreplication.event import QueryEvent, RotateEvent, FormatDescriptionEvent class MySQLReader(Reader): @@ -29,9 +28,10 @@ class MySQLReader(Reader): resume_stream = None binlog_stream = None nice_pause = 0 + exit_gracefully = False write_rows_event_num = 0 - write_rows_event_each_row_num = 0; + write_rows_event_each_row_num = 0 binlog_position_file = None @@ -56,13 +56,15 @@ def __init__( self.server_id = server_id self.log_file = log_file self.log_pos = log_pos - self.schemas = None if not TableProcessor.extract_dbs(schemas, Util.join_lists(tables, tables_prefixes)) else TableProcessor.extract_dbs(schemas, Util.join_lists(tables, tables_prefixes)) + self.schemas = None if not TableProcessor.extract_dbs(schemas, Util.join_lists(tables, + tables_prefixes)) else TableProcessor.extract_dbs( + schemas, Util.join_lists(tables, tables_prefixes)) self.tables = None if tables is None else TableProcessor.extract_tables(tables) self.tables_prefixes = None if tables_prefixes is None else TableProcessor.extract_tables(tables_prefixes) self.blocking = blocking self.resume_stream = resume_stream self.nice_pause = nice_pause - self.binlog_position_file=binlog_position_file + self.binlog_position_file = binlog_position_file logging.info("raw dbs list. len()=%d", 0 if schemas is None else len(schemas)) if schemas is not None: @@ -86,7 +88,8 @@ def __init__( if tables_prefixes is not None: for table in tables_prefixes: logging.info(table) - logging.info("normalised tables-prefixes list. len()=%d", 0 if self.tables_prefixes is None else len(self.tables_prefixes)) + logging.info("normalised tables-prefixes list. 
len()=%d", + 0 if self.tables_prefixes is None else len(self.tables_prefixes)) if self.tables_prefixes is not None: for table in self.tables_prefixes: logging.info(table) @@ -101,28 +104,28 @@ def __init__( # we are interested in reading CH-repeatable events only only_events=[ # Possible events - #BeginLoadQueryEvent, + # BeginLoadQueryEvent, DeleteRowsEvent, - #ExecuteLoadQueryEvent, - #FormatDescriptionEvent, - #GtidEvent, - #HeartbeatLogEvent, - #IntvarEvent - #NotImplementedEvent, - #QueryEvent, - #RotateEvent, - #StopEvent, - #TableMapEvent, + # ExecuteLoadQueryEvent, + # FormatDescriptionEvent, + # GtidEvent, + # HeartbeatLogEvent, + # IntvarEvent + # NotImplementedEvent, + # QueryEvent, + # RotateEvent, + # StopEvent, + # TableMapEvent, UpdateRowsEvent, WriteRowsEvent, - #XidEvent, + # XidEvent, ], only_schemas=self.schemas, # in case we have any prefixes - this means we need to listen to all tables within specified schemas only_tables=self.tables if not self.tables_prefixes else None, log_file=self.log_file, log_pos=self.log_pos, - freeze_schema=True, # If true do not support ALTER TABLE. It's faster. + freeze_schema=True, # If true do not support ALTER TABLE. It's faster. blocking=False, resume_stream=self.resume_stream, ) @@ -245,6 +248,9 @@ def process_write_rows_event(self, mysql_event): :param mysql_event: WriteRowsEvent instance :return: """ + + logging.debug("Received insert event for table: " + mysql_event.table) + if self.tables_prefixes: # we have prefixes specified # need to find whether current event is produced by table in 'looking-into-tables' list @@ -294,10 +300,81 @@ def process_write_rows_event(self, mysql_event): self.stat_write_rows_event_finalyse() def process_update_rows_event(self, mysql_event): - logging.info("Skip update rows") + + logging.debug("Received update event for table: " + mysql_event.table + " Schema: " + mysql_event.schema) + + # for row in mysql_event.rows: + # for key in row['before_values']: + # logging.debug("\t *%s:%s=>%s" % (key, row["before_values"][key], row["after_values"][key])) + + if self.tables_prefixes: + # we have prefixes specified + # need to find whether current event is produced by table in 'looking-into-tables' list + if not self.is_table_listened(mysql_event.table): + # this table is not listened + # processing is over - just skip event + return + + # statistics + self.stat_write_rows_event_calc_rows_num_min_max(rows_num_per_event=len(mysql_event.rows)) + + if self.subscribers('UpdateRowsEvent'): + # dispatch event to subscribers + + # statistics + # self.stat_write_rows_event_all_rows(mysql_event=mysql_event) + + # dispatch Event + event = Event() + event.schema = mysql_event.schema + event.table = mysql_event.table + event.pymysqlreplication_event = mysql_event + + self.process_first_event(event=event) + self.notify('UpdateRowsEvent', event=event) + + # self.stat_write_rows_event_finalyse() + + # logging.info("Skip update rows") def process_delete_rows_event(self, mysql_event): - logging.info("Skip delete rows") + logging.debug("Received delete event for table: " + mysql_event.table) + + """ + for row in mysql_event.rows: + for key in row['values']: + logging.debug("\t *", key, ":", row["values"][key]) + """ + + if self.tables_prefixes: + # we have prefixes specified + # need to find whether current event is produced by table in 'looking-into-tables' list + if not self.is_table_listened(mysql_event.table): + # this table is not listened + # processing is over - just skip event + return + + # statistics + # 
self.stat_write_rows_event_calc_rows_num_min_max(rows_num_per_event=len(mysql_event.rows)) + + if self.subscribers('DeleteRowsEvent'): + # dispatch event to subscribers + + # statistics + # self.stat_write_rows_event_all_rows(mysql_event=mysql_event) + + # dispatch Event + event = Event() + event.schema = mysql_event.schema + event.table = mysql_event.table + event.pymysqlreplication_event = mysql_event + + self.process_first_event(event=event) + self.notify('DeleteRowsEvent', event=event) + + # self.stat_write_rows_event_finalyse() + + # logging.info("Skip delete rows") def process_binlog_position(self, file, pos): if self.binlog_position_file: @@ -312,7 +389,7 @@ def read(self): # fetch events try: - while True: + while not self.exit_gracefully: logging.debug('Check events in binlog stream') self.init_fetch_loop() @@ -321,10 +398,17 @@ def read(self): self.stat_init_fetch_loop() try: + logging.debug('Pre-start binlog position: ' + self.binlog_stream.log_file + ":" + str( + self.binlog_stream.log_pos) if self.binlog_stream.log_pos is not None else "undef") + # fetch available events from MySQL for mysql_event in self.binlog_stream: - # new event has come - # check what to do with it + + if self.exit_gracefully: + break + + logging.debug( + 'Got Event ' + self.binlog_stream.log_file + ":" + str(self.binlog_stream.log_pos)) # process event based on its type if isinstance(mysql_event, WriteRowsEvent): @@ -337,23 +421,19 @@ def read(self): # skip other unhandled events pass - # after event processed, we need to handle current binlog position - self.process_binlog_position(self.binlog_stream.log_file, self.binlog_stream.log_pos) + # after event processed, we need to handle current binlog position + self.process_binlog_position(self.binlog_stream.log_file, self.binlog_stream.log_pos) - except KeyboardInterrupt: - # pass SIGINT further - logging.info("SIGINT received. Pass it further.") - raise except Exception as ex: if self.blocking: # we'd like to continue waiting for data # report and continue cycle logging.warning("Got an exception, skip it in blocking mode") - logging.warning(ex) + logging.exception(ex) else: # do not continue, report error and exit logging.critical("Got an exception, abort it in non-blocking mode") - logging.critical(ex) + logging.exception(ex) sys.exit(1) # all events fetched (or none of them available) @@ -363,25 +443,23 @@ def read(self): if not self.blocking: # do not wait for more data - all done - break # while True + break # while True # blocking - wait for more data if self.nice_pause > 0: time.sleep(self.nice_pause) self.notify('ReaderIdleEvent') - - except KeyboardInterrupt: - logging.info("SIGINT received. 
Time to exit.") except Exception as ex: logging.warning("Got an exception, handle it") - logging.warning(ex) + logging.exception(ex) try: self.binlog_stream.close() + logging.info("Stop reading from MySQL") except Exception as ex: logging.warning("Unable to close binlog stream correctly") - logging.warning(ex) + logging.exception(ex) end_timestamp = int(time.time()) @@ -389,6 +467,12 @@ def read(self): logging.info('end %d', end_timestamp) logging.info('len %d', end_timestamp - self.start_timestamp) + def close(self): + self.exit_gracefully = True + self.nice_pause = 0 + logging.info("MySQL should stop in the next loop") + + if __name__ == '__main__': connection_settings = { 'host': '127.0.0.1', diff --git a/clickhouse_mysql/reader/reader.py b/clickhouse_mysql/reader/reader.py index 379cf5f..107d04a 100644 --- a/clickhouse_mysql/reader/reader.py +++ b/clickhouse_mysql/reader/reader.py @@ -18,6 +18,13 @@ class Reader(Observable): # called when Reader has no data to read 'ReaderIdleEvent': [], + + # called on each DeleteRowsEvent + 'DeleteRowsEvent': [], + + # called on each UpdateRowsEvent + 'UpdateRowsEvent': [], + } def __init__(self, converter=None, callbacks={}): @@ -26,3 +33,6 @@ def __init__(self, converter=None, callbacks={}): def read(self): pass + + def close(self): + pass diff --git a/clickhouse_mysql/tablemigrator.py b/clickhouse_mysql/tablemigrator.py index b631a0f..07e0986 100644 --- a/clickhouse_mysql/tablemigrator.py +++ b/clickhouse_mysql/tablemigrator.py @@ -2,8 +2,9 @@ # -*- coding: utf-8 -*- import logging +import os.path -from MySQLdb.cursors import SSDictCursor +from MySQLdb.cursors import SSDictCursor,Cursor from clickhouse_mysql.tableprocessor import TableProcessor from clickhouse_mysql.tablesqlbuilder import TableSQLBuilder from clickhouse_mysql.event.event import Event @@ -37,9 +38,15 @@ def __init__( user=None, password=None, dbs=None, + dst_schema=None, + dst_table=None, + dst_table_prefix=None, + distribute=None, + cluster=None, tables=None, tables_prefixes=None, tables_where_clauses=None, + column_skip=[], ): super().__init__( host=host, @@ -47,8 +54,14 @@ def __init__( user=user, password=password, dbs=dbs, + dst_schema=dst_schema, + dst_table=dst_table, + dst_table_prefix=dst_table_prefix, + distribute=distribute, + cluster=cluster, tables=tables, tables_prefixes=tables_prefixes, + column_skip=column_skip ) self.client.cursorclass = SSDictCursor @@ -92,7 +105,11 @@ def __init__( db, table = TableProcessor.parse_full_table_name(full_table_name) if not db in self.where_clauses: self.where_clauses[db] = {} - self.where_clauses[db][table] = open(where_file_name, 'r').read().strip("\n") + + if os.path.isfile(where_file_name): + self.where_clauses[db][table] = open(where_file_name, 'r').read().strip("\n") + else: + self.where_clauses[db][table] = where_file_name # debug info logging.info("migration where clauses") @@ -167,17 +184,18 @@ def migrate_one_table_data(self, db=None, table=None): :return: number of migrated rows """ - self.client.cursorclass = SSDictCursor - self.client.connect(db=db) # build SQL statement - sql = "SELECT * FROM {0}".format(self.create_full_table_name(db=db, table=table)) + full_table_name = self.create_full_table_name(db=db, table=table) + sql = "SELECT {0} FROM {1}".format(",".join(self.get_columns(db, full_table_name)), full_table_name) # in case we have WHERE clause for this db.table - add it to SQL if db in self.where_clauses and table in self.where_clauses[db]: sql += " WHERE {}".format(self.where_clauses[db][table]) try: 
logging.info("migrate_table. sql={}".format(sql)) + self.client.cursorclass = SSDictCursor + self.client.connect(db=db) self.client.cursor.execute(sql) cnt = 0; while True: @@ -195,18 +213,27 @@ def migrate_one_table_data(self, db=None, table=None): self.chwriter.flush() cnt += len(rows) - except: - raise Exception("Can not migrate table on host={} user={} password={} db={} table={} cnt={}".format( - self.host, - self.user, - self.password, + except Exception as ex: + logging.critical("Critical error: {}".format(str(ex))) + raise Exception("Can not migrate table on db={} table={}".format( db, table, - cnt )) return cnt + def get_columns(self,db,full_table_name): + self.client.cursorclass = Cursor + self.client.connect(db=db) + self.client.cursor.execute("DESC {}".format(full_table_name)) + fields = [] + for (_field, _type, _null, _key, _default, _extra,) in self.client.cursor: + if self.column_skip.__contains__(_field): + logging.debug("skip column %s",_field) + continue + fields.append('`{}`'.format(_field)) + + return fields if __name__ == '__main__': tb = TableBuilder( diff --git a/clickhouse_mysql/tableprocessor.py b/clickhouse_mysql/tableprocessor.py index 18aa60d..7bb96b8 100644 --- a/clickhouse_mysql/tableprocessor.py +++ b/clickhouse_mysql/tableprocessor.py @@ -25,15 +25,21 @@ def __init__( user=None, password=None, dbs=None, + dst_schema=None, + dst_table=None, + dst_table_prefix=None, + distribute=None, + cluster=None, tables=None, tables_prefixes=None, + column_skip=[], ): """ :param host: string MySQL host :param port: int MySQL port :param user: string MySQL user :param password: string MySQL password - :param dbs: list of string MySQL datatabse. May be omitted, in this case tables has to contain full table names, Ex.: db.table1 + :param dbs: list of string MySQL databases. May be omitted, in this case tables has to contain full table names, Ex.: db.table1 :param tables: list of string list of table names. Table names may be short or full form :param tables_prefixes: list of string list of table prefixes. 
May be short or full form """ @@ -46,12 +52,18 @@ def __init__( 'user': user, 'password': password, }) + self.dst_schema = dst_schema + self.dst_table = dst_table + self.dst_table_prefix = dst_table_prefix + self.cluster = cluster + self.distribute = distribute + self.column_skip = column_skip def dbs_tables_lists(self): """ Prepare dict of databases and with list of tables for each db Include all tables into db tables list in case to tables are explicitly specified - It still can be no tables - incase db really has no tables + It still can be no tables - in case db really has no tables For convenient iteration over all tables :return: @@ -131,15 +143,41 @@ def tables_match(self, db, prefix): return res @staticmethod - def create_full_table_name(db=None, table=None): + def create_full_table_name(dst_schema=None, dst_table=None, dst_table_prefix=None, db=None, table=None, distribute=None): """ - Create fully-specified table name as `db`.`table` or just `table` + Create fully-specified table name as `schema_all`.`db__table_all` or `schema`.`db__table` or just `db`.`table` + :param dst_schema: :param db: :param table: - :return: `db`.`table` or just `table` + :param distribute: + :return: `schema_all`.`db__table_all` or `schema`.`db__table` or just `db`.`table` """ - return '`{0}`.`{1}`'.format(db, table) if db else '`{0}`'.format(table) + + # target table can be renamed with dst_table + table = dst_table if dst_table is not None else table + + # simple case - do not move table into another db + if dst_schema is None: + return '`{0}`.`{1}`'.format(db, table) if db else '`{0}`'.format(table) + + if distribute: + dst_schema += "_all" + table += "_all" + + return \ + '`{0}`.`{1}`'.format(dst_schema, TableProcessor.create_migrated_table_name(prefix=dst_table_prefix, table=table)) \ + if db else \ + '`{0}`'.format(table) + + @staticmethod + def create_migrated_table_name(prefix=None, table=None): + prefix = prefix if prefix is not None else "" + return prefix + table + + @staticmethod + def create_distributed_table_name(db=None, table=None): + return db + "__" + table + "_all" @staticmethod def is_full_table_name(full_name): @@ -265,7 +303,9 @@ def extract_dbs(dbs=[], tables=[]): :param tables: list of tables with (otional) full names :return: set of db names """ - dbs_group = TableProcessor.group_tables(dbs=dbs, tables=tables, unsettled_tables_action=TableProcessor.ACTION_IGNORE_TABLE) + dbs_group = TableProcessor.group_tables(dbs=dbs, + tables=tables, + unsettled_tables_action=TableProcessor.ACTION_IGNORE_TABLE) return dbs_group.keys() @@ -276,7 +316,8 @@ def extract_tables(tables=[]): :param tables: list of (possibly) full names :return: set of short names """ - dbs_group = TableProcessor.group_tables(tables=tables, unsettled_tables_action=TableProcessor.ACTION_INCLUDE_TABLE) + dbs_group = TableProcessor.group_tables(tables=tables, + unsettled_tables_action=TableProcessor.ACTION_INCLUDE_TABLE) res = set() for db in dbs_group: res.update(dbs_group[db]) diff --git a/clickhouse_mysql/tablesqlbuilder.py b/clickhouse_mysql/tablesqlbuilder.py index 6fe4db6..77d10f2 100644 --- a/clickhouse_mysql/tablesqlbuilder.py +++ b/clickhouse_mysql/tablesqlbuilder.py @@ -3,6 +3,8 @@ from clickhouse_mysql.tableprocessor import TableProcessor from MySQLdb.cursors import Cursor +import logging + class TableSQLBuilder(TableProcessor): """ @@ -16,15 +18,16 @@ def templates(self): :return: dict of ClickHouse's CREATE TABLE () templates { 'db1': { - 'table1': CREATE TABLE TABLE1 TEMPLATE, + 'table-db1-1': CREATE TABLE 
table1 statement template, + 'table-db1-2': CREATE TABLE table2 statement template, }, 'db2': { - 'table2': CREATE TABLE TABLE2 TEMPLATE, + 'table-db2-1': CREATE TABLE table1 statement template, + 'table-db2-2': CREATE TABLE table2 statement template, } } """ dbs = self.dbs_tables_lists() - if dbs is None: return None @@ -32,38 +35,57 @@ def templates(self): for db in dbs: templates[db] = {} for table in dbs[db]: - templates[db][table] = self.create_table_description(db=db, table=table) + templates[db][table] = self.create_table_description( + cluster=self.cluster, + dst_schema=self.dst_schema, + dst_table=self.dst_table, + dst_table_prefix=self.dst_table_prefix, + db=db, + table=table) return templates - def create_table_description(self, db=None, table=None): + def create_table_description(self, cluster=None, dst_schema=None, dst_table=None, dst_table_prefix=None, db=None, table=None): """ High-level function. Produce either text ClickHouse's table SQL CREATE TABLE() template or JSON ClikcHouse's table description :param db: string MySQL db name :param table: string MySQL table name - :param json: bool what shold return - json description or ClickHouse's SQL template + :param json: bool what should return - json description or ClickHouse's SQL template :return: dict{"template":SQL, "fields": {}} or string SQL """ columns_description = self.create_table_columns_description(db=db, table=table) return { - "create_table_template": self.create_table_sql_template(db=db, table=table, columns_description=columns_description), - "create_table": self.create_table_sql(db=db, table=table, columns_description=columns_description), - "create_database": self.create_database_sql(db=db), + "create_table_template": self.create_table_sql_template(cluster=cluster, + dst_schema=dst_schema, + dst_table=dst_table, + dst_table_prefix=dst_table_prefix, + db=db, + table=table, + columns_description=columns_description), + "create_table": self.create_table_sql(cluster=cluster, + dst_schema=dst_schema, + dst_table=dst_table, + dst_table_prefix=dst_table_prefix, + db=db, + table=table, + columns_description=columns_description), + "create_database": self.create_database_sql(dst_schema=dst_schema, db=db), "fields": columns_description, } - def create_table_sql_template(self, db=None, table=None, columns_description=None): + def create_table_sql_template(self, cluster=None, dst_schema=None, dst_table=None, dst_table_prefix=None, db=None, table=None, columns_description=None): """ Produce table template for ClickHouse - CREATE TABLE( + CREATE TABLE schema.table ( ... columns specification ... 
) ENGINE = MergeTree(_, (), 8192) for specified MySQL's table - :param table: string - name of the table in MySQL which will be used as a base for CH's CREATE TABLE template + :param db: string - name of the DB in MySQL + :param table: string - name of the table in MySQL which will be used as a base for CH's CREATE TABLE template :return: string - almost-ready-to-use ClickHouse CREATE TABLE statement """ @@ -71,16 +93,18 @@ def create_table_sql_template(self, db=None, table=None, columns_description=Non for column_description in columns_description: ch_columns.append('`{}` {}'.format(column_description['field'], column_description['clickhouse_type_nullable'])) - sql = """CREATE TABLE IF NOT EXISTS {} ( + sql = """CREATE TABLE IF NOT EXISTS {} {} ( {} -) ENGINE = MergeTree(, (), 8192) +) +ENGINE = MergeTree(, (), 8192) """.format( - self.create_full_table_name(db=db, table=table), - ",\n ".join(ch_columns) + self.create_full_table_name(dst_schema=dst_schema, dst_table=dst_table, dst_table_prefix=dst_table_prefix, db=db, table=table), + "on cluster {}".format(cluster) if cluster is not None else "", + ",\n ".join(ch_columns), ) return sql - def create_table_sql(self, db=None, table=None, columns_description=None): + def create_table_sql(self, cluster=None, dst_schema=None, dst_table=None, dst_table_prefix=None, db=None, table=None, columns_description=None): """ Produce table template for ClickHouse CREATE TABLE( @@ -89,8 +113,9 @@ def create_table_sql(self, db=None, table=None, columns_description=None): ... ) ENGINE = MergeTree(PRIMARY DATE FIELD, (COMMA SEPARATED INDEX FIELDS LIST), 8192) for specified MySQL's table - :param table: string - name of the table in MySQL which will be used as a base for CH's CREATE TABLE template + :param db: string - name of the DB in MySQL + :param table: string - name of the table in MySQL which will be used as a base for CH's CREATE TABLE template :return: string - ready-to-use ClickHouse CREATE TABLE statement """ @@ -99,10 +124,10 @@ def create_table_sql(self, db=None, table=None, columns_description=None): primary_date_field = self.fetch_primary_date_field(columns_description) primary_key_fields = self.fetch_primary_key_fields(columns_description) - if primary_date_field is None: - # No primary date field found. Make one - primary_date_field = 'primary_date_field' - ch_columns.append('`primary_date_field` Date default today()') + # if primary_date_field is None: + # # No primary date field found. Make one + # primary_date_field = 'primary_date_field' + # ch_columns.append('`primary_date_field` Date default today()') if primary_key_fields is None: # No primary key fields found. 
Make PK from primary date field @@ -115,26 +140,33 @@ def create_table_sql(self, db=None, table=None, columns_description=None): ch_type = column_description['clickhouse_type'] if (field == primary_date_field) or (field in primary_key_fields) else column_description['clickhouse_type_nullable'] ch_columns.append('`{}` {}'.format(field, ch_type)) - sql = """CREATE TABLE IF NOT EXISTS {} ( + sql = """CREATE TABLE IF NOT EXISTS {} {} ( {} -) ENGINE = MergeTree({}, ({}), 8192) +) +{} """.format( - self.create_full_table_name(db=db, table=table), + self.create_full_table_name(dst_schema=dst_schema, dst_table=dst_table, dst_table_prefix=dst_table_prefix, db=db, table=table, distribute=self.distribute), + "on cluster {}".format(cluster) if not self.distribute and cluster is not None else "", ",\n ".join(ch_columns), - primary_date_field, - ",".join(primary_key_fields), + self.create_table_engine(self.cluster, + self.dst_schema, + self.create_migrated_table_name(prefix=dst_table_prefix, table=dst_table) if dst_table is not None else self.create_migrated_table_name(prefix=dst_table_prefix, table=table), + primary_date_field, + ",".join(primary_key_fields), + self.distribute) ) return sql - def create_database_sql(self, db): + def create_database_sql(self, dst_schema=None, db=None): """ Produce create database statement for ClickHouse CREATE DATABASE for specified MySQL's db - :param db: string - name of the DB in MySQL + + :param db: string - name of the DB :return: string - ready-to-use ClickHouse CREATE DATABASE statement """ - sql = "CREATE DATABASE IF NOT EXISTS `{}`".format(db) + sql = "CREATE DATABASE IF NOT EXISTS `{}`".format(dst_schema if dst_schema is not None else db) return sql def create_table_columns_description(self, db=None, table=None, ): @@ -163,6 +195,9 @@ def create_table_columns_description(self, db=None, table=None, ): # build ready-to-sql column specification Ex.: # `integer_1` Nullable(Int32) # `u_integer_1` Nullable(UInt32) + if self.column_skip.__contains__(_field): + logging.debug("table sql builder skip column %s",_field) + continue columns_description.append({ 'field': _field, 'mysql_type': _type, @@ -183,7 +218,9 @@ def fetch_primary_date_field(self, columns_description): :return: string|None """ for column_description in columns_description: - if (column_description['clickhouse_type'] == 'Date'): + if column_description['clickhouse_type'] == 'Date': + return column_description['field'] + if column_description['clickhouse_type'] == 'DateTime': return column_description['field'] return None @@ -329,6 +366,36 @@ def map_type_nullable(self, mysql_type, nullable=False): return ch_type + def create_table_engine(self, + cluster=None, + dst_schema=None, + dst_table=None, + primary_date_field=None, + primary_key_fields=None, + distribute=None): + """ + :param cluster: + :param dst_schema: + :param dst_table: + :param primary_date_field: + :param primary_key_fields: + :param distribute: + :return: + """ + if distribute: + return "ENGINE = Distributed({}, '{}', '{}', rand())".format( + cluster, + dst_schema, + dst_table + ) + else: + engine = "ENGINE = ReplacingMergeTree() " + if primary_date_field is not None: + engine += "PARTITION BY toYYYYMM({}) ".format(primary_date_field) + if primary_key_fields is not None: + engine += "ORDER BY ({})".format(primary_key_fields) + return engine + if __name__ == '__main__': tb = TableSQLBuilder( host='127.0.0.1', diff --git a/clickhouse_mysql/writer/chcsvwriter.py b/clickhouse_mysql/writer/chcsvwriter.py deleted file mode 100644 index 
605544e..0000000 --- a/clickhouse_mysql/writer/chcsvwriter.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import os -import time -import logging - -from clickhouse_mysql.writer.writer import Writer - - -class CHCSVWriter(Writer): - """Write into ClickHouse via CSV file and clickhouse-client tool""" - - dst_schema = None - dst_table = None - - host = None - port = None - user = None - password = None - - def __init__( - self, - connection_settings, - dst_schema=None, - dst_table=None, - ): - logging.info("CHCSWriter() connection_settings={} dst_schema={} dst_table={}".format(connection_settings, dst_schema, dst_table)) - self.host = connection_settings['host'] - self.port = connection_settings['port'] - self.user = connection_settings['user'] - self.password = connection_settings['password'] - self.dst_schema = dst_schema - self.dst_table = dst_table - - def insert(self, event_or_events=None): - # event_or_events = [ - # event: { - # row: {'id': 3, 'a': 3} - # }, - # event: { - # row: {'id': 3, 'a': 3} - # }, - # ] - - events = self.listify(event_or_events) - if len(events) < 1: - logging.warning('No events to insert. class: %s', __class__) - return - - # assume we have at least one Event - - logging.debug('class:%s insert %d rows', __class__, len(events)) - - for event in events: - schema = self.dst_schema if self.dst_schema else event.schema - table = self.dst_table if self.dst_table else event.table - - sql = 'INSERT INTO `{0}`.`{1}` ({2}) FORMAT CSV'.format( - schema, - table, - ', '.join(map(lambda column: '`%s`' % column, event.fieldnames)), - ) - - choptions = "" - if self.host: - choptions += " --host=" + self.host - if self.port: - choptions += " --port=" + str(self.port) - if self.user: - choptions += " --user=" + self.user - if self.password: - choptions += " --password=" + self.password - bash = "tail -n +2 '{0}' | clickhouse-client {1} --query='{2}'".format( - event.filename, - choptions, - sql, - ) - - logging.info('starting clickhouse-client process') - logging.debug('starting %s', bash) - os.system(bash) - - pass diff --git a/clickhouse_mysql/writer/chwriter.py b/clickhouse_mysql/writer/chwriter.py index 96d8030..c43ec42 100644 --- a/clickhouse_mysql/writer/chwriter.py +++ b/clickhouse_mysql/writer/chwriter.py @@ -4,10 +4,13 @@ import logging import sys +from decimal import Decimal + from clickhouse_mysql.dbclient.chclient import CHClient from clickhouse_mysql.writer.writer import Writer -from clickhouse_mysql.event.event import Event +from clickhouse_mysql.tableprocessor import TableProcessor +import datetime class CHWriter(Writer): @@ -16,19 +19,29 @@ class CHWriter(Writer): client = None dst_schema = None dst_table = None + dst_distribute = None def __init__( self, connection_settings, dst_schema=None, dst_table=None, + dst_table_prefix=None, + dst_distribute=False, next_writer_builder=None, converter_builder=None, ): - logging.info("CHWriter() connection_settings={} dst_schema={} dst_table={}".format(connection_settings, dst_schema, dst_table)) + if dst_distribute and dst_schema is not None: + dst_schema += "_all" + if dst_distribute and dst_table is not None: + dst_table += "_all" + logging.info("CHWriter() connection_settings={} dst_schema={} dst_table={} dst_distribute={}".format( + connection_settings, dst_schema, dst_table, dst_distribute)) self.client = CHClient(connection_settings) self.dst_schema = dst_schema self.dst_table = dst_table + self.dst_table_prefix = dst_table_prefix + self.dst_distribute = dst_distribute def 
insert(self, event_or_events=None): # event_or_events = [ @@ -55,11 +68,25 @@ def insert(self, event_or_events=None): event_converted = None for event in events: if not event.verify: - logging.warning('Event verification failed. Skip one event. Event: %s Class: %s', event.meta(), __class__) - continue # for event + logging.warning( + 'Event verification failed. Skip one event. Event: %s Class: %s', event.meta(), __class__) + continue # for event event_converted = self.convert(event) for row in event_converted: + # These columns are added to identify the last change (tb_upd) and the kind of operation performed + # 0 - INSERT, 1 - UPDATE, 2 - DELETE + row['tb_upd'] = datetime.datetime.now() + row['operation'] = 0 + + for key in row.keys(): + # we need to convert Decimal or timedelta values to str values suitable for the table structure + if type(row[key]) in [Decimal, datetime.timedelta]: + row[key] = str(row[key]) + + # These columns are added to identify the last change (tb_upd) and when a row is deleted (1) + # row['tb_upd'] = datetime.datetime.now() + # row['operation'] = 0 rows.append(row) logging.debug('class:%s insert %d row(s)', __class__, len(rows)) @@ -67,8 +94,18 @@ def insert(self, event_or_events=None): # determine target schema.table schema = self.dst_schema if self.dst_schema else event_converted.schema - table = self.dst_table if self.dst_table else event_converted.table - logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format(schema, table, self.dst_schema, self.dst_table)) + table = None + if self.dst_distribute: + table = TableProcessor.create_distributed_table_name( + db=event_converted.schema, table=event_converted.table) + else: + table = self.dst_table if self.dst_table else event_converted.table + if self.dst_schema: + table = TableProcessor.create_migrated_table_name( + prefix=self.dst_table_prefix, table=table) + + logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format( + schema, table, self.dst_schema, self.dst_table)) # and INSERT converted rows @@ -79,16 +116,226 @@ def insert(self, event_or_events=None): table, ', '.join(map(lambda column: '`%s`' % column, rows[0].keys())) ) + logging.debug(f"CHWRITER QUERY INSERT: {sql}") + self.client.execute(sql, rows) + except Exception as ex: + logging.critical('QUERY FAILED') + logging.critical('ex={}'.format(ex)) + logging.critical('sql={}'.format(sql)) + logging.critical('data={}'.format(rows)) + # sys.exit(0) + + # all DONE + + def delete_row(self, event_or_events): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + logging.debug("Delete CHWriter") + + events = self.listify(event_or_events) + if len(events) < 1: + logging.warning('No events to delete. class: %s', __class__) + return + + # assume we have at least one Event + + logging.debug('class:%s delete %d event(s)', __class__, len(events)) + + # verify and convert events and consolidate converted rows from all events into one batch + + rows = [] + event_converted = None + for event in events: + if not event.verify: + logging.warning('Event verification failed. Skip one event. 
Event: %s Class: %s', event.meta(), + __class__) + continue # for event + + event_converted = self.convert(event) + for row in event_converted: + # These columns are added to identify the last change (tb_upd) and the kind of operation performed + # 0 - INSERT, 1 - UPDATE, 2 - DELETE + row['tb_upd'] = datetime.datetime.now() + row['operation'] = 2 + + for key in row.keys(): + # we need to convert Decimal or timedelta value to str value for suitable for table structure + if type(row[key]) in [Decimal, datetime.timedelta]: + row[key] = str(row[key]) + + # These columns are added to identify the last change (tb_upd) and when a row is deleted (1) + # row['tb_upd'] = datetime.datetime.now() + # row['operation'] = 2 + rows.append(row) + + logging.debug('class:%s delete %d row(s)', __class__, len(rows)) + + # determine target schema.table + + schema = self.dst_schema if self.dst_schema else event_converted.schema + table = None + if self.dst_distribute: + table = TableProcessor.create_distributed_table_name( + db=event_converted.schema, table=event_converted.table) + else: + table = self.dst_table if self.dst_table else event_converted.table + if self.dst_schema: + table = TableProcessor.create_migrated_table_name( + prefix=self.dst_table_prefix, table=table) + + logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format(schema, table, self.dst_schema, + self.dst_table)) + + # and DELETE converted rows + + # These columns are added to identify the last change (tb_upd) and the kind of operation performed + # 0 - INSERT, 1 - UPDATE, 2 - DELETE + rows[0]['tb_upd'] = datetime.datetime.now() + rows[0]['operation'] = 2 + + sql = '' + try: + sql = 'INSERT INTO `{0}`.`{1}` ({2}) VALUES'.format( + schema, + table, + ', '.join(map(lambda column: '`%s`' % column, rows[0].keys())) + ) + logging.debug(f"CHWRITER QUERY DELETE: {sql}") self.client.execute(sql, rows) + + # sql = '' + # try: + # sql = 'ALTER TABLE `{0}`.`{1}` DELETE WHERE {2}'.format( + # schema, + # table, + # ' and '.join(filter(None, map( + # lambda column, value: "" if column != pk else self.get_data_format(column, value), + # row.keys(), row.values()))) + # ) + # + # self.client.execute(sql) + except Exception as ex: logging.critical('QUERY FAILED') logging.critical('ex={}'.format(ex)) logging.critical('sql={}'.format(sql)) - logging.critical('rows={}'.format(rows)) - sys.exit(0) + # sys.exit(0) # all DONE + """ + Get string format pattern for update and delete operations + """ + + def get_data_format(self, column, value): + t = type(value) + if t == str: + return "`%s`='%s'" % (column, value.replace("'", "\\'")) + elif t is datetime.datetime: + return "`%s`='%s'" % (column, value) + else: + # int, float + return "`%s`=%s" % (column, value) + + def update(self, event_or_events): + # event_or_events = [ + # event: { + # row: { + # 'before_values': {'id': 3, 'a': 3}, + # 'after_values': {'id': 3, 'a': 2} + # } + # }, + # event: { + # row: { + # 'before_values': {'id': 2, 'a': 3}, + # 'after_values': {'id': 2, 'a': 2} + # } + # }, + # ] + + logging.debug("Update CHWriter") + + events = self.listify(event_or_events) + if len(events) < 1: + logging.warning('No events to update. 
class: %s', __class__) + return + + # assume we have at least one Event + + logging.debug('class:%s update %d event(s)', __class__, len(events)) + + # verify and converts events and consolidate converted rows from all events into one batch + + rows = [] + event_converted = None + for event in events: + if not event.verify: + logging.warning('Event verification failed. Skip one event. Event: %s Class: %s', event.meta(), + __class__) + continue # for event + + event_converted = self.convert(event) + for row in event_converted.pymysqlreplication_event.rows: + + for key in row['after_values'].keys(): + # we need to convert Decimal or timedelta value to str value for suitable for table structure + if type(row['after_values'][key]) in [Decimal, datetime.timedelta]: + row['after_values'][key] = str( + row['after_values'][key]) + + # These columns are added to identify the last change (tb_upd) and when a row is deleted (1) + row['after_values']['tb_upd'] = datetime.datetime.now() + row['after_values']['operation'] = 1 + rows.append(row['after_values']) + + logging.debug('class:%s update %d row(s)', __class__, len(rows)) + + # determine target schema.table + + schema = self.dst_schema if self.dst_schema else event_converted.schema + table = None + if self.dst_distribute: + table = TableProcessor.create_distributed_table_name( + db=event_converted.schema, table=event_converted.table) + else: + table = self.dst_table if self.dst_table else event_converted.table + if self.dst_schema: + table = TableProcessor.create_migrated_table_name( + prefix=self.dst_table_prefix, table=table) + + logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format(schema, table, self.dst_schema, + self.dst_table)) + + # and UPDATE converted rows + + # These columns are added to identify the last change (tb_upd) and when a row is deleted (1) + rows[0]['tb_upd'] = datetime.datetime.now() + rows[0]['operation'] = 1 + + sql = '' + try: + sql = 'INSERT INTO `{0}`.`{1}` ({2}) VALUES'.format( + schema, + table, + ', '.join(map(lambda column: '`%s`' % column, rows[0].keys())) + ) + logging.debug(f"CHWRITER QUERY UPDATE: {sql}") + self.client.execute(sql, rows) + except Exception as ex: + logging.critical('QUERY FAILED') + logging.critical('ex={}'.format(ex)) + logging.critical('sql={}'.format(sql)) + logging.critical('data={}'.format(rows)) + # sys.exit(0) + + # all DONE if __name__ == '__main__': diff --git a/clickhouse_mysql/writer/csvwriter.py b/clickhouse_mysql/writer/csvwriter.py index 10ec143..34bd096 100644 --- a/clickhouse_mysql/writer/csvwriter.py +++ b/clickhouse_mysql/writer/csvwriter.py @@ -11,6 +11,10 @@ from clickhouse_mysql.writer.writer import Writer from clickhouse_mysql.event.event import Event +import datetime + +from pymysqlreplication.row_event import WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent + class CSVWriter(Writer): """Write CSV files""" @@ -34,16 +38,17 @@ def __init__( csv_keep_file=False, dst_schema=None, dst_table=None, + dst_table_prefix=None, next_writer_builder=None, converter_builder=None, ): logging.info("CSVWriter() " - "csv_file_path={} " - "csv_file_path_prefix={} " - "csv_file_path_suffix_parts={} " - "csv_keep_file={} " - "dst_schema={} " - "dst_table={} ".format( + "csv_file_path={} " + "csv_file_path_prefix={} " + "csv_file_path_suffix_parts={} " + "csv_keep_file={} " + "dst_schema={} " + "dst_table={} ".format( csv_file_path, csv_file_path_prefix, csv_file_path_suffix_parts, @@ -58,6 +63,7 @@ def __init__( self.path_suffix_parts = csv_file_path_suffix_parts 
self.dst_schema = dst_schema self.dst_table = dst_table + self.dst_table_prefix = dst_table_prefix if self.path is None: if not self.path_suffix_parts: @@ -87,6 +93,7 @@ def open(self): # open file for write-at-the-end mode self.file = open(self.path, 'a+') + def insert(self, event_or_events): # event_or_events = [ # event: { @@ -116,13 +123,74 @@ def insert(self, event_or_events): logging.warning('Event verification failed. Skip insert(). Event: %s Class: %s', event.meta(), __class__) return - self.fieldnames = sorted(self.convert(copy.copy(event.first_row())).keys()) + event_converted = self.convert(event) + rows = event_converted.pymysqlreplication_event.rows + headers = list(rows[0]['values'].keys()) + headers.append('operation') + headers.append('tb_upd') + + # self.fieldnames = sorted(self.convert(copy.copy(event.first_row())).keys()) + self.fieldnames = headers + if self.dst_schema is None: + self.dst_schema = event.schema + if self.dst_table is None: + self.dst_table = event.table + + self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, quoting=csv.QUOTE_MINIMAL) + if not self.header_written: + self.writer.writeheader() + + for event in events: + if not event.verify: + logging.warning('Event verification failed. Skip one event. Event: %s Class: %s', event.meta(), __class__) + continue # for event + self.generate_row(event) + + def delete_row(self, event_or_events): + + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + logging.debug("Delete CSV Writer") + + events = self.listify(event_or_events) + if len(events) < 1: + logging.warning('No events to delete. class: %s', __class__) + return + + # assume we have at least one Event + + logging.debug('class:%s delete %d events', __class__, len(events)) + + if not self.opened(): + self.open() + + if not self.writer: + # pick any event from the list + event = events[0] + if not event.verify: + logging.warning('Event verification failed. Skip insert(). Event: %s Class: %s', event.meta(), __class__) + return + + event_converted = self.convert(event) + rows = event_converted.pymysqlreplication_event.rows + headers = list(rows[0]['values'].keys()) + headers.append('operation') + headers.append('tb_upd') + + self.fieldnames = headers if self.dst_schema is None: self.dst_schema = event.schema if self.dst_table is None: self.dst_table = event.table - self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames) + self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, quoting=csv.QUOTE_MINIMAL) if not self.header_written: self.writer.writeheader() @@ -130,11 +198,96 @@ def insert(self, event_or_events): if not event.verify: logging.warning('Event verification failed. Skip one event. Event: %s Class: %s', event.meta(), __class__) continue # for event + self.generate_row(event) + + + + def update(self, event_or_events): + + # event_or_events = [ + # event: { + # row: { + # 'before_values': {'id': 3, 'a': 3}, + # 'after_values': {'id': 3, 'a': 2} + # } + # }, + # event: { + # row: { + # 'before_values': {'id': 2, 'a': 3}, + # 'after_values': {'id': 2, 'a': 2} + # } + # }, + # ] + + logging.debug("Update CSV Writer") + + events = self.listify(event_or_events) + if len(events) < 1: + logging.warning('No events to update. 
class: %s', __class__) + return + + # assume we have at least one Event + + logging.debug('class:%s updated %d events', __class__, len(events)) + + if not self.opened(): + self.open() + + if not self.writer: + # pick any event from the list + event = events[0] + if not event.verify: + logging.warning('Event verification failed. Skip insert(). Event: %s Class: %s', event.meta(), __class__) + return + + event_converted = self.convert(event) + rows = event_converted.pymysqlreplication_event.rows + headers = list(rows[0]['after_values'].keys()) + headers.append('operation') + headers.append('tb_upd') + + # self.fieldnames = sorted(headers) + self.fieldnames = headers + if self.dst_schema is None: + self.dst_schema = event.schema + if self.dst_table is None: + self.dst_table = event.table + + self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, quoting=csv.QUOTE_MINIMAL) + if not self.header_written: + self.writer.writeheader() + + for event in events: + if not event.verify: + logging.warning('Event verification failed. Skip one event. Event: %s Class: %s', event.meta(), __class__) + continue # for event + + event_converted = self.convert(event) + self.generate_row(event_converted) + + + def generate_row(self, event): + """ When using mempool or csvpool events are cached so you can receive different kind of events in the same list. These events should be handled in a different way """ + + if isinstance(event.pymysqlreplication_event, WriteRowsEvent): for row in event: + row['tb_upd'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") + row['operation'] = 0 self.writer.writerow(self.convert(row)) + elif isinstance(event.pymysqlreplication_event, DeleteRowsEvent): + for row in event: + row['tb_upd'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") + row['operation'] = 2 + self.writer.writerow(self.convert(row)) + elif isinstance(event.pymysqlreplication_event, UpdateRowsEvent): + for row in event.pymysqlreplication_event.rows: + row['after_values']['tb_upd'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") + row['after_values']['operation'] = 1 + self.writer.writerow(self.convert(row['after_values'])) + def push(self): - if not self.next_writer_builder: + if not self.next_writer_builder or not self.fieldnames: return event = Event() diff --git a/clickhouse_mysql/writer/poolwriter.py b/clickhouse_mysql/writer/poolwriter.py index b49e011..303ed84 100644 --- a/clickhouse_mysql/writer/poolwriter.py +++ b/clickhouse_mysql/writer/poolwriter.py @@ -37,9 +37,26 @@ def insert(self, event_or_events): logging.debug('class:%s insert', __class__) self.pool.insert(event_or_events) + # TODO delete if delete_row works + def delete(self, event_or_events): + """Insert delete data into Pool""" + logging.debug('class:%s delete', __class__) + self.pool.insert(event_or_events) + + def delete_row(self, event_or_events): + """Insert delete data into Pool""" + logging.debug('class:%s delete', __class__) + self.pool.insert(event_or_events) + + def update(self, event_or_events): + """Insert update data into Pool""" + logging.debug('class:%s update', __class__) + self.pool.insert(event_or_events) + def flush(self): self.pool.flush() + if __name__ == '__main__': path = 'file.csv' diff --git a/clickhouse_mysql/writer/processwriter.py b/clickhouse_mysql/writer/processwriter.py index 226b72b..b3584f2 100644 --- a/clickhouse_mysql/writer/processwriter.py +++ b/clickhouse_mysql/writer/processwriter.py @@ -35,6 +35,28 @@ def process(self, event_or_events=None): writer.destroy() 
logging.debug('class:%s process() done', __class__) + def processDelete(self, event_or_events=None): + """Separate process body to be run""" + + logging.debug('class:%s process()', __class__) + writer = self.next_writer_builder.get() + writer.delete_row(event_or_events) + writer.close() + writer.push() + writer.destroy() + logging.debug('class:%s processDelete() done', __class__) + + def processUpdate(self, event_or_events=None): + """Separate process body to be run""" + + logging.debug('class:%s process()', __class__) + writer = self.next_writer_builder.get() + writer.update(event_or_events) + writer.close() + writer.push() + writer.destroy() + logging.debug('class:%s processUpdate() done', __class__) + def insert(self, event_or_events=None): # event_or_events = [ # event: { @@ -57,6 +79,50 @@ def insert(self, event_or_events=None): logging.debug('class:%s insert done', __class__) pass + def delete(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + # start separated process with event_or_events to be inserted + + logging.debug('class:%s delete', __class__) + process = mp.Process(target=self.processDelete, args=(event_or_events,)) + + logging.debug('class:%s delete.process.start()', __class__) + process.start() + + #process.join() + logging.debug('class:%s delete done', __class__) + pass + + def update(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + # start separated process with event_or_events to be inserted + + logging.debug('class:%s update', __class__) + process = mp.Process(target=self.processUpdate, args=(event_or_events,)) + + logging.debug('class:%s update.process.start()', __class__) + process.start() + + #process.join() + logging.debug('class:%s update done', __class__) + pass + def flush(self): pass diff --git a/clickhouse_mysql/writer/tbcsvwriter.py b/clickhouse_mysql/writer/tbcsvwriter.py new file mode 100644 index 0000000..9684f25 --- /dev/null +++ b/clickhouse_mysql/writer/tbcsvwriter.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import logging +import time + +from clickhouse_mysql.writer.writer import Writer + +import requests +from requests_toolbelt.multipart.encoder import MultipartEncoder +import json + + +class TBCSVWriter(Writer): + """Write into Tinybird via CSV file""" + + dst_schema = None + dst_table = None + dst_distribute = None + + tb_host = None + tb_token = None + + def __init__( + self, + tb_host, + tb_token, + dst_schema=None, + dst_table=None, + dst_table_prefix=None, + dst_distribute=False, + ): + # if dst_distribute and dst_schema is not None: + # dst_schema += "_all" + # if dst_distribute and dst_table is not None: + # dst_table += "_all" + # logging.info( + # "CHCSWriter() connection_settings={} dst_schema={} dst_table={}".format(connection_settings, dst_schema, + # dst_table)) + self.tb_host = tb_host + self.tb_token = tb_token + + if self.tb_host is None or self.tb_token is None: + logging.critical( + f" Host: {self.tb_host} or token {self.tb_token} is missing") + return None + + self.dst_schema = dst_schema + self.dst_table = dst_table + self.dst_table_prefix = dst_table_prefix + self.dst_distribute = dst_distribute + + def uploadCSV(self, table, filename, tries=1): + limit_of_retries = 3 + params = { + 'name': table, + 'mode': 'append' + } + + try: + with open(filename, 'rb') as f: + m = 
MultipartEncoder(fields={'csv': ('csv', f, 'text/csv')}) + url = f"{self.tb_host}/v0/datasources" + + response = requests.post( + url, + data=m, + headers={ + 'Authorization': 'Bearer ' + self.tb_token, + 'Content-Type': m.content_type + }, + params=params, + verify=False) + + # logging.debug(response.text) + logging.info(response.json()) + if response.status_code == 200: + json_object = json.loads(response.content) + logging.debug(f"Import id: {json_object['import_id']}") + elif response.status_code == 429: + retry_after = int(response.headers['Retry-After']) + tries + logging.error( + f"Too many requests retrying in {retry_after} seconds to upload {filename } to {table}") + time.sleep(retry_after) + self.uploadCSV(table, filename, tries + 1) + else: + # In case of error let's retry only + logging.exception(response.json()) + time.sleep(tries) + logging.info(f"Retrying { tries } of { limit_of_retries }") + if tries > limit_of_retries: + return + self.uploadCSV(table, filename, tries + 1) + except Exception as e: + logging.exception(e) + # We wait tries^2 sec to try again + time.sleep(tries * tries) + logging.info(f"Retrying { tries } of { limit_of_retries }") + if tries > limit_of_retries: + return + self.uploadCSV(table, filename, tries + 1) + + def insert(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + events = self.listify(event_or_events) + if len(events) < 1: + logging.warning('No events to insert. class: %s', __class__) + return + + # assume we have at least one Event + + logging.debug('class:%s insert %d rows', __class__, len(events)) + + for event in events: + #schema = self.dst_schema if self.dst_schema else event.schema + table = self.dst_table if self.dst_table else event.table + self.uploadCSV(table, event.filename) + + pass + + def deleteRow(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + # events = self.listify(event_or_events) + # if len(events) < 1: + # logging.warning('No events to delete. 
class: %s', __class__) + # return + + # # assume we have at least one Event + + # logging.debug('class:%s delete %d rows', __class__, len(events)) + + # for event in events: + # schema = self.dst_schema if self.dst_schema else event.schema + # table = None + # if self.dst_distribute: + # table = TableProcessor.create_distributed_table_name(db=event.schema, table=event.table) + # else: + # table = self.dst_table if self.dst_table else event.table + # if self.dst_schema: + # table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) + + # sql = 'ALTER TABLE `{0}`.`{1}` DELETE WHERE {2} = {3} '.format( + # schema, + # table, + # ' AND '.join(map(lambda column: '`%s`' % column, event.fieldnames)), + # ) + + # choptions = "" + # if self.host: + # choptions += " --host=" + shlex.quote(self.host) + # if self.port: + # choptions += " --port=" + str(self.port) + # if self.user: + # choptions += " --user=" + shlex.quote(self.user) + # if self.password: + # choptions += " --password=" + shlex.quote(self.password) + # bash = "tail -n +2 '{0}' | clickhouse-client {1} --query='{2}'".format( + # event.filename, + # choptions, + # sql, + # ) + + # logging.info('starting clickhouse-client process for delete operation') + # logging.debug('starting %s', bash) + # os.system(bash) + + logging.debug("CHCSVWriter: delete row") + pass + + def update(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + # logging.info('starting clickhouse-client process for update operation') + + # events = self.listify(event_or_events) + # if len(events) < 1: + # logging.warning('No events to update. class: %s', __class__) + # return + + # # assume we have at least one Event + + # logging.debug('class:%s update %d rows', __class__, len(events)) + + # for event in events: + # schema = self.dst_schema if self.dst_schema else event.schema + # table = None + # if self.dst_distribute: + # table = TableProcessor.create_distributed_table_name(db=event.schema, table=event.table) + # else: + # table = self.dst_table if self.dst_table else event.table + # if self.dst_schema: + # table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) + + # sql = 'INSERT INTO `{0}`.`{1}` ({2}) FORMAT CSV'.format( + # schema, + # table, + # ', '.join(map(lambda column: '`%s`' % column, event.fieldnames)), + # ) + + # sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {3}'.format( + # schema, + # table, + # ', '.join(map(lambda column, value: '`%s`=`%s' % column, event.fieldnames, event.fieldnames)) + # ) + + # choptions = "" + # if self.host: + # choptions += " --host=" + shlex.quote(self.host) + # if self.port: + # choptions += " --port=" + str(self.port) + # if self.user: + # choptions += " --user=" + shlex.quote(self.user) + # if self.password: + # choptions += " --password=" + shlex.quote(self.password) + # bash = "tail -n +2 '{0}' | clickhouse-client {1} --query='{2}'".format( + # event.filename, + # choptions, + # sql, + # ) + + # logging.info('starting clickhouse-client process') + # logging.debug('starting %s', bash) + # os.system(bash) + + logging.debug("CHCSVWriter: delete row") + + pass diff --git a/clickhouse_mysql/writer/writer.py b/clickhouse_mysql/writer/writer.py index 11f788c..1bfaeb0 100644 --- a/clickhouse_mysql/writer/writer.py +++ b/clickhouse_mysql/writer/writer.py @@ -55,6 +55,34 @@ def insert(self, event_or_events=None): # ] pass + def update(self, event_or_events=None): + # 
event_or_events = [ + # event: { + # row: { + # 'before_values': {'id': 3, 'a': 3}, + # 'after_values': {'id': 3, 'a': 2} + # } + # }, + # event: { + # row: { + # 'before_values': {'id': 2, 'a': 3}, + # 'after_values': {'id': 2, 'a': 2} + # } + # }, + # ] + pass + + def delete_row(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + pass + def flush(self): pass diff --git a/db.log_201801_1.sql b/db.log_201801_1.sql deleted file mode 100644 index d578dfe..0000000 --- a/db.log_201801_1.sql +++ /dev/null @@ -1 +0,0 @@ -id < 1727831 diff --git a/db.log_201801_2.sql b/db.log_201801_2.sql deleted file mode 100644 index b4d0459..0000000 --- a/db.log_201801_2.sql +++ /dev/null @@ -1 +0,0 @@ -id < 1727834 diff --git a/db.log_201801_3.sql b/db.log_201801_3.sql deleted file mode 100644 index 7ee37b5..0000000 --- a/db.log_201801_3.sql +++ /dev/null @@ -1 +0,0 @@ -id < 1727855 diff --git a/dev_run_cli_options_local_table_migration.sh b/dev_run_cli_options_local_table_migration.sh new file mode 100755 index 0000000..3ac6d53 --- /dev/null +++ b/dev_run_cli_options_local_table_migration.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# This script performs migration of one table test.books from local MySQL +# into one table test.books in local ClickHouse +# Tables are created manually by user and are expected by migrator to be in place +# Migrator exists after all data from migrated table is copied into ClickHouse + +# ugly stub to suppress unsufficient sockets +#sudo bash -c "echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse" + +# run data reader with specified Python version + +PYTHON="python3" + +CH_MYSQL="-m clickhouse_mysql.main" + +if [ ! -d "clickhouse_mysql" ]; then + # no clickhouse_mysql dir available - step out of examples dir + cd .. 
+fi + +MYSQL_USER=reader +MYSQL_PASSWORD=qwerty +SRC_TABLES=test.books +DST_SCHEMA=test +DST_TABLE=books + +MYSQL_USER=user1 +MYSQL_PASSWORD=qwerty +SRC_TABLES=repl.foo +DST_SCHEMA=repl1 +DST_TABLE=foo1 + +$PYTHON $CH_MYSQL ${*:1} \ + --src-server-id=1 \ + --nice-pause=1 \ + --log-level=debug \ + \ + --src-host=127.0.0.1 \ + --src-user="${MYSQL_USER}" \ + --src-password="${MYSQL_PASSWORD}" \ + --src-tables="${SRC_TABLES}" \ + \ + --dst-host=127.0.0.1 \ + --dst-create-table \ + --with-create-database \ + \ + --csvpool \ + --csvpool-file-path-prefix=qwe_ \ + --mempool-max-flush-interval=60 \ + --mempool-max-events-num=10000 \ + \ + --binlog-position-file=qwe.txt \ + --pump-data \ + --migrate-table \ + --src-wait \ + --src-resume + +# --dst-schema="${DST_SCHEMA}" \ +# --dst-table="${DST_TABLE}" \ +# --dst-table="${DST_SCHEMA}.${DST_TABLE}" \ +# --dst-table-prefix="pr1_" \ +# --log-file=ontime.log \ +# --mempool +# --mempool-max-events-num=3 +# --mempool-max-flush-interval=30 +# --dst-file=dst.csv +# --dst-schema=db +# --dst-table=datatypes +# --csvpool-keep-files +# --log-level=info \ diff --git a/dev_run_config.sh b/dev_run_config_file.sh similarity index 100% rename from dev_run_config.sh rename to dev_run_config_file.sh diff --git a/docs/manual.md b/docs/manual.md new file mode 100644 index 0000000..a064842 --- /dev/null +++ b/docs/manual.md @@ -0,0 +1,1252 @@ +# Table of Contents + + * [Introduction](#introduction) + * [Requirements and Installation](#requirements-and-installation) + + * [PyPi Installation](#pypi-installation) + * [GitHub-based Installation - Clone Sources](#github-based-installation---clone-sources) + * [MySQL setup](#mysql-setup) + * [Quick Start](#quick-start) + * [Operation](#operation) + * [Requirements and Limitations](#requirements-and-limitations) + * [Operation General Schema](#operation-general-schema) + * [Performance](#performance) + * [Examples](#examples) + * [Base Example](#base-example) + * [MySQL Migration Case 1 - with Tables Lock](#mysql-migration-case-1---with-tables-lock) + * [MySQL Migration Case 1 - Create ClickHouse Table](#mysql-migration-case-1---create-clickhouse-table) + * [MySQL Migration Case 1 - Migrate Existing Data](#mysql-migration-case-1---migrate-existing-data) + * [MySQL Migration Case 1 - Listen For New Data](#mysql-migration-case-1---listen-for-new-data) + * [MySQL Migration Case 2 - without Tables Lock](#mysql-migration-case-2---without-tables-lock) + * [MySQL Migration Case 2 - Create ClickHouse Table](#mysql-migration-case-2---create-clickhouse-table) + * [MySQL Migration Case 2 - Listen For New Data](#mysql-migration-case-2---listen-for-new-data) + * [MySQL Migration Case 2 - Migrate Existing Data](#mysql-migration-case-2---migrate-existing-data) + * [airline.ontime Test Case](#airlineontime-test-case) + * [airline.ontime Data Set in CSV files](#airlineontime-data-set-in-csv-files) + * [airline.ontime MySQL Table](#airlineontime-mysql-table) + * [airline.ontime ClickHouse Table](#airlineontime-clickhouse-table) + * [airline.ontime Data Reader](#airlineontime-data-reader) + * [airline.ontime Data Importer](#airlineontime-data-importer) + * [Testing](#testing) + * [Testing General Schema](#testing-general-schema) + * [MySQL Data Types](#mysql-data-types) + * [ClickHouse Data Types](#clickhouse-data-types) + * [MySQL -> ClickHouse Data Types Mapping](#mysql---clickhouse-data-types-mapping) + * [MySQL Test Tables](#mysql-test-tables) + * [ClickHouse Test Tables](#clickhouse-test-tables) + +--- + +# Introduction + +Utility to import 
data into ClickHouse from MySQL (mainly) and/or CSV files + +# Requirements and Installation + +Datareader requires at least **Python 3.4** with additional modules to be installed. +In most distributions Python 3 have `pip` utility named as `pip3`, so we'll use this naming. +However, you may have it called differently. + +Datareader can be installed either from `github` repo or from `pypi` repo. + + + +## PyPi Installation +In case you need just to use the app - this is the most convenient way to go. + +Install dependencies. +MySQL repo (for `mysql-community-devel`) +```bash +sudo yum install -y https://dev.mysql.com/get/mysql57-community-release-el7-11.noarch.rpm +``` +epel (for `python3`) +```bash +sudo yum install -y epel-release +``` + +clickhouse-client (for `clickhouse-client`) from Packagecloud repo from [packagecloud.io](https://packagecloud.io/Altinity/clickhouse) +More details on installation are available on [https://github.com/Altinity/clickhouse-rpm-install](https://github.com/Altinity/clickhouse-rpm-install) +```bash +curl -s https://packagecloud.io/install/repositories/altinity/clickhouse/script.rpm.sh | sudo bash +``` +```bash +sudo yum install -y clickhouse-client +``` + +and direct dependencies: +```bash +sudo yum install -y mysql-community-devel +sudo yum install -y mariadb-devel +sudo yum install -y gcc +sudo yum install -y python34-devel python34-pip +``` + +Install data reader +```bash +sudo pip3 install clickhouse-mysql +``` + +Now we are able to call datareader as an app - perform last installation steps - install service files, etc +```bash +[user@localhost ~]$ which clickhouse-mysql +/usr/bin/clickhouse-mysql +/usr/bin/clickhouse-mysql --install +``` + +## GitHub-based Installation - Clone Sources +In case you'd like to play around with the sources this is the way to go. 
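+
+Whichever installation path you take, a quick import check helps to confirm that the Python environment is complete. This is a minimal sketch, not part of the tool itself; the module-to-package mapping simply mirrors the dependencies listed below:
+```python
+#!/usr/bin/env python3
+"""Sanity-check that the data reader's Python dependencies are importable."""
+
+import importlib
+
+# module name -> pip package that provides it
+REQUIRED = {
+    'MySQLdb': 'mysqlclient',
+    'pymysqlreplication': 'mysql-replication',
+    'clickhouse_driver': 'clickhouse-driver',
+}
+
+missing = []
+for module, package in REQUIRED.items():
+    try:
+        importlib.import_module(module)
+        print('OK      {} (from {})'.format(module, package))
+    except ImportError:
+        missing.append(package)
+        print('MISSING {} - install with: pip3 install {}'.format(module, package))
+
+if missing:
+    raise SystemExit('Install missing packages first: ' + ' '.join(missing))
+```
+If any package is reported missing, install it with `pip3` as described below.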
+ +Install dependencies: + +`MySQLdb` package is used for communication with MySQL: +```bash +pip3 install mysqlclient +``` + +`mysql-replication` package is used for communication with MySQL also: +[https://github.com/noplay/python-mysql-replication](https://github.com/noplay/python-mysql-replication) +```bash +pip3 install mysql-replication +``` + +`clickhouse-driver` package is used for communication with ClickHouse: +[https://github.com/mymarilyn/clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) +```bash +pip3 install clickhouse-driver +``` + +Clone sources from github +```bash +git clone https://github.com/Altinity/clickhouse-mysql-data-reader +``` + +## MySQL setup + +Also the following (at least one of) MySQL privileges are required for this operation: `SUPER`, `REPLICATION CLIENT` + +```mysql +CREATE USER 'reader'@'%' IDENTIFIED BY 'qwerty'; +CREATE USER 'reader'@'127.0.0.1' IDENTIFIED BY 'qwerty'; +CREATE USER 'reader'@'localhost' IDENTIFIED BY 'qwerty'; +GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'%'; +GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'127.0.0.1'; +GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'localhost'; +FLUSH PRIVILEGES; +``` + +Also the following MySQL config options are required: +```ini +[mysqld] +# mandatory +server-id = 1 +log_bin = /var/lib/mysql/bin.log +binlog-format = row # very important if you want to receive write, update and delete row events +# optional +expire_logs_days = 30 +max_binlog_size = 768M +# setup listen address +bind-address = 0.0.0.0 +``` + +# Quick Start + +Suppose we have MySQL `airline.ontime` table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](../clickhouse_mysql_examples/airline_ontime_schema_mysql.sql) and want to migrate it into ClickHouse. 
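+
+Before running the migration, it can be worth confirming that the `reader` account and the row-based binlog configured above are actually reachable. The snippet below is a minimal sketch based on the `mysql-replication` package and is not part of the tool; the host, credentials and `server_id` value are assumptions taken from this manual - adjust them to your setup:
+```python
+#!/usr/bin/env python3
+"""Connect to the MySQL binlog and print the row events found in the current binlog file."""
+
+from pymysqlreplication import BinLogStreamReader
+from pymysqlreplication.row_event import WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent
+
+MYSQL = {'host': '127.0.0.1', 'port': 3306, 'user': 'reader', 'passwd': 'qwerty'}
+
+stream = BinLogStreamReader(
+    connection_settings=MYSQL,
+    server_id=999,      # any id not used by another replica of this master
+    blocking=False,     # return when there are no more events instead of waiting
+    resume_stream=False,
+    only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
+)
+
+for number, event in enumerate(stream):
+    print('{}.{}: {} row(s) in {}'.format(event.schema, event.table, len(event.rows), type(event).__name__))
+    if number >= 9:     # ten events are enough for a smoke test
+        break
+
+stream.close()
+```
+Run a couple of `INSERT` statements on the source first; if the script prints the corresponding row events, binlog access for the `reader` account is configured correctly.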
+ +Steps to do: + + * Setup MySQL access as described in [MySQL setup](#mysql-setup) + * Run data reader as following: + +```bash +clickhouse-mysql \ + --src-server-id=1 \ + --src-wait \ + --nice-pause=1 \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=qwerty \ + --src-tables=airline.ontime \ + --dst-host=127.0.0.1 \ + --dst-create-table \ + --migrate-table \ + --pump-data \ + --csvpool +``` + +Expected results are: + * automatically create target table in ClickHouse (if possible) + * migrate existing data from MySQL to ClickHouse + * after migration completed, listen for new events to come and pump data from MySQL into ClickHouse + +Options description + * `--src-server-id` - Master's server id + * `--src-wait` - wait for new data to come + * `--nice-pause=1` - when no data available sleep for 1 second + * `--src-host=127.0.0.1` - MySQL source host + * `--src-user=reader` - MySQL source user (remember about PRIVILEGES for this user) + * `--src-password=qwerty` - MySQL source password (remember about PRIVILEGES for this user) + * `--src-tables=airline.ontime` - list of MySQL source tables to process + * `--dst-host=127.0.0.1` - ClickHouse host + * `--dst-create-table` - create target table automatically + * `--migrate-table` - migrate source tables + * `--pump-data` - pump data from MySQL into ClickHouse after data migrated + * `--csvpool` - make pool of csv files while pumping data (assumes `--mempool` also) + +Choose any combination of `--pump-data`, `--migrate-table`, `--create-table-sql`, `--dst-create-table` + +# Operation + +## Requirements and Limitations + +Data reader understands INSERT SQL statements only. In practice this means that: + * You need to create required table in ClickHouse before starting data read procedure. More on how to create target ClickHouse table: [MySQL -> ClickHouse Data Types Mapping](#mysql---clickhouse-data-types-mapping) + * From all DML statements INSERT-only are handled, which means: + * UPDATE statements are not handled - meaning UPDATEs within MySQL would not be relayed into ClickHouse + * DELETE statements are not handled - meaning DELETEs within MySQL would not be relayed into ClickHouse + * DDL statements are not handled, which means: + * source table structure change (ALTER TABLE) has to be handled externally and can lead to insertion errors + +## Operation General Schema + + * Step 1. Data Reader reads data from the source event-by-event (for MySQL binlog) or line-by-line (file). + * Step 2. **OPTIONAL** Caching in memory pool. Since ClickHouse prefers to get data in bundles (row-by-row insertion is extremely slow), we need to introduce some caching. + Cache can be flushed by either of: + * number of rows in cache + * number of events in cache + * time elapsed + * data source depleted + * Step 3. **OPTIONAL** Writing CSV file. Sometimes it is useful to have data also represented as a file + * Step 4. Writing data into ClickHouse. Depending on the configuration of the previous steps data are written into ClickHouse by either of: + * directly event-by-event or line-by-line + * from memory cache as a bulk insert operation + * from CSV file via `clickhouse-client` + +## Performance + +`pypy` significantly improves performance. You should try it. Really. Up to **10 times performance boost** can be achieved. 
+For example you can start with [Portable PyPy distribution for Linux](https://github.com/squeaky-pl/portable-pypy#portable-pypy-distribution-for-linux) + - use [Python 3.x release](https://github.com/squeaky-pl/portable-pypy#latest-python-35-release) +Unpack it into your place of choice. + +```bash +[user@localhost ~]$ ls -l pypy3.5-5.9-beta-linux_x86_64-portable +total 32 +drwxr-xr-x 2 user user 140 Oct 24 01:14 bin +drwxr-xr-x 5 user user 4096 Oct 3 11:57 include +drwxr-xr-x 4 user user 4096 Oct 3 11:57 lib +drwxr-xr-x 13 user user 4096 Oct 3 11:56 lib_pypy +drwxr-xr-x 3 user user 15 Oct 3 11:56 lib-python +-rw-r--r-- 1 user user 11742 Oct 3 11:56 LICENSE +-rw-r--r-- 1 user user 1296 Oct 3 11:56 README.rst +drwxr-xr-x 14 user user 4096 Oct 24 01:16 site-packages +drwxr-xr-x 2 user user 195 Oct 3 11:57 virtualenv_support +``` + +Install `pip` +```bash +pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy -m ensurepip +``` +Install required modules +```bash +pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install mysql-replication +pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install clickhouse-driver +``` +`mysqlclient` may require to install `libmysqlclient-dev` and `gcc` +```bash +pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install mysqlclient +``` +Install them if need be +```bash +sudo apt-get install libmysqlclient-dev +``` +```bash +sudo apt-get install gcc +``` + +Now you can run data reader via `pypy` +```bash +/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy clickhouse-mysql +``` + +# Examples + +## Base Example + +Let's walk over test example of tool launch command line options. +This code snippet is taken from shell script (see more details in [airline.ontime Test Case](#airlineontime-test-case)) + +```bash +$PYTHON clickhouse-mysql ${*:1} \ + --src-server-id=1 \ + --src-resume \ + --src-wait \ + --nice-pause=1 \ + --log-level=info \ + --log-file=ontime.log \ + --src-host=127.0.0.1 \ + --src-user=root \ + --dst-host=127.0.0.1 \ + --csvpool \ + --csvpool-file-path-prefix=qwe_ \ + --mempool-max-flush-interval=60 \ + --mempool-max-events-num=1000 \ + --pump-data +``` +Options description + * `--src-server-id` - Master's server id + * `--src-resume` - resume data loading from the previous point. 
When the tool starts - resume from the end of the log + * `--src-wait` - wait for new data to come + * `--nice-pause=1` - when no data available sleep for 1 second + * `--log-level=info` - log verbosity + * `--log-file=ontime.log` - log file name + * `--src-host=127.0.0.1` - MySQL source host + * `--src-user=root` - MySQL source user (remember about PRIVILEGES for this user) + * `--dst-host=127.0.0.1` - ClickHouse host + * `--csvpool` - make pool of csv files (assumes `--mempool` also) + * `--csvpool-file-path-prefix=qwe_` - put these CSV files having `qwe_` prefix in `CWD` + * `--mempool-max-flush-interval=60` - flush mempool at least every 60 seconds + * `--mempool-max-events-num=1000` - flush mempool at least each 1000 events (not rows, but events) + * `--pump-data` - pump data from MySQL into ClickHouse + +## MySQL Migration Case 1 - with Tables Lock + +Suppose we have MySQL `airline.ontime` table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](../clickhouse_mysql_examples/airline_ontime_schema_mysql.sql) with multiple rows: + +```mysql +mysql> SELECT COUNT(*) FROM airline.ontime; ++----------+ +| count(*) | ++----------+ +| 7694964 | ++----------+ +``` + +MySQL is already configured as [described earlier](#mysql-setup). +Let's migrate existing data to ClickHouse and listen for newly coming data in order to migrate them to CLickHouse on-the-fly. + +### MySQL Migration Case 1 - Create ClickHouse Table + +Create ClickHouse table description +```bash +clickhouse-mysql \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=Qwerty1# \ + --create-table-sql-template \ + --with-create-database \ + --src-tables=airline.ontime > create_clickhouse_table_template.sql +``` +We have **CREATE TABLE** template stored in `create_clickhouse_table_template.sql` file. +```bash +vim create_clickhouse.sql +``` +Setup sharding field and primary key. These columns must not be `Nullable` +```bash mysql +...cut... + `Year` UInt16, +...cut... + `FlightDate` Date, +...cut... + `Month` UInt8, +...cut... +) ENGINE = MergeTree(FlightDate, (FlightDate, Year, Month), 8192) +``` + +Create table in ClickHouse +```bash +clickhouse-client -mn < create_clickhouse_table_template.sql +``` + +### MySQL Migration Case 1 - Migrate Existing Data + +Lock MySQL in order to avoid new data coming while data migration is running. Keep `mysql` client open during the whole process +```mysql +mysql> FLUSH TABLES WITH READ LOCK; +``` + +Migrate data +```bash +clickhouse-mysql \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=Qwerty1# \ + --migrate-table \ + --src-tables=airline.ontime \ + --dst-host=127.0.0.1 +``` +This may take some time. +Check all data is in ClickHouse +```mysql +:) select count(*) from airline.ontime; + +SELECT count(*) +FROM airline.ontime + +┌─count()─┐ +│ 7694964 │ +└─────────┘ +``` + +### MySQL Migration Case 1 - Listen For New Data + +Start `clickhouse-mysql` as a replication slave, so it will listen for new data coming: +```bash +clickhouse-mysql \ + --src-server-id=1 \ + --src-resume \ + --src-wait \ + --nice-pause=1 \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=Qwerty1# \ + --src-tables=airline.ontime \ + --dst-host=127.0.0.1 \ + --csvpool \ + --csvpool-file-path-prefix=qwe_ \ + --mempool-max-flush-interval=60 \ + --mempool-max-events-num=10000 \ + --pump-data +``` + +Allow new data to be inserted into MySQL - i.e. unlock tables. + +```mysql +mysql> UNLOCK TABLES; +``` + +Insert some data into MySQL. 
For example, via [clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh](../clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh) script + +```mysql +mysql> SELECT COUNT(*) FROM airline.ontime; ++----------+ +| count(*) | ++----------+ +| 10259952 | ++----------+ +``` + +Replication will be pumping data from MySQL into ClickHouse in background and in some time we'll see the following picture in ClickHouse: +```mysql +:) select count(*) from airline.ontime; + +SELECT count(*) +FROM airline.ontime + +┌──count()─┐ +│ 10259952 │ +└──────────┘ +``` + +## MySQL Migration Case 2 - without Tables Lock +Suppose we'd like to migrate multiple log tables of the same structure named as `log_XXX` - i.e. all of them have `log_` name prefix +into one ClickHouse table named `logunified` of the following structure +```sql +DESCRIBE TABLE logunified + +┌─name─┬─type───┬─default_type─┬─default_expression─┐ +│ id │ UInt64 │ │ │ +│ day │ Date │ │ │ +│ str │ String │ │ │ +└──────┴────────┴──────────────┴────────────────────┘ +``` +Log tables by nature are `INSERT`-only tables. Let's migrate these tables. + +### MySQL Migration Case 2 - Create ClickHouse Table +Prepare tables templates in `create_clickhouse.sql` file +```bash +clickhouse-mysql \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=qwerty \ + --create-table-sql-template \ + --with-create-database \ + --src-tables-prefixes=db.log_ > create_clickhouse_table_template.sql +``` +Edit templates +```bash +vim create_clickhouse_table_template.sql +``` +And create tables in ClickHouse +```bash + +clickhouse-client -mn < create_clickhouse_table_template.sql +``` + +### MySQL Migration Case 2 - Listen For New Data +```bash +clickhouse-mysql \ + --src-server-id=1 \ + --src-resume \ + --src-wait \ + --nice-pause=1 \ + --log-level=info \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=qwerty \ + --src-tables-prefixes=log_ \ + --dst-host=127.0.0.1 \ + --dst-table=logunified \ + --csvpool \ + --pump-data +``` +Pay attention to +```bash + --src-tables-prefixes=log_ \ + --dst-table=logunified \ +``` +Replication data from multiple tables into one destination table `--dst-table=logunified`. + +Monitor logs for `first row in replication` notification of the following structure: +```bash +INFO:first row in replication db.log_201801_2 +column: id=1727834 +column: day=2018-01-20 +column: str=data event 3 +``` +These records help us to create SQL statement for Data Migration process. +Sure, we can peek into MySQL database manually in order to understand what records would be the last to be copied by migration process. 
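+
+Instead of collecting these values by hand, the `first row in replication` records can be turned into the per-table where-clause files used in the next step. The helper below is a hypothetical sketch (it is not shipped with the tool) and only assumes the log format shown above:
+```python
+#!/usr/bin/env python3
+"""Build per-table where-clause files (e.g. db.log_201801_2.sql) from the replication log.
+
+Usage: python3 make_where_clauses.py <replication-log-file>
+"""
+
+import re
+import sys
+
+TABLE_RE = re.compile(r'first row in replication (?P<table>\S+)')
+ID_RE = re.compile(r'column: id=(?P<id>\d+)')
+
+def main(log_path):
+    table = None
+    clauses = {}
+    with open(log_path) as log:
+        for line in log:
+            match = TABLE_RE.search(line)
+            if match:
+                table = match.group('table')          # e.g. db.log_201801_2
+                continue
+            match = ID_RE.search(line)
+            if match and table and table not in clauses:
+                clauses[table] = int(match.group('id'))
+                table = None
+
+    for table, first_id in clauses.items():
+        filename = table + '.sql'                     # e.g. db.log_201801_2.sql
+        with open(filename, 'w') as out:
+            out.write('id < {}\n'.format(first_id))
+        print('{}: id < {}'.format(filename, first_id))
+
+if __name__ == '__main__':
+    main(sys.argv[1])
+```
+The generated `db.log_201801_*.sql` files can then be passed to `--src-tables-where-clauses` as shown in the next section.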
+ +### MySQL Migration Case 2 - Migrate Existing Data + +```bash +clickhouse-mysql \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=qwerty \ + --migrate-table \ + --src-tables-prefixes=db.log_ \ + --src-tables-where-clauses=db.log_201801_1=db.log_201801_1.sql,db.log_201801_2=db.log_201801_2.sql,db.log_201801_3=db.log_201801_3.sql \ + --dst-host=127.0.0.1 \ + --dst-table=logunified \ + --csvpool +``` + +Pay attention to +```bash + --src-tables-prefixes=db.log_ \ + --src-tables-where-clauses=db.log_201801_1=db.log_201801_1.sql,db.log_201801_2=db.log_201801_2.sql,db.log_201801_3=db.log_201801_3.sql \ + --dst-table=logunified \ +``` +Migration subset of data described in `--src-tables-where-clauses` files from multiple tables into one destination table `--dst-table=logunified` + +Values for where clause in `db.log_201801_1.sql` are fetched from `first row in replication` log: `INFO:first row in replication db.log_201801_1` +```bash +cat db.log_201801_1.sql +id < 1727831 +``` + +Result: +```sql +:) select count(*) from logunified; + +SELECT count(*) +FROM logunified + +┌──count()─┐ +│ 12915568 │ +└──────────┘ + +``` + +## airline.ontime Test Case + +Main Steps + * Download airline.ontime dataset + * Create airline.ontime MySQL table + * Create airline.ontime ClickHouse table + * Start data reader (utility to migrate data MySQL -> ClickHouse) + * Start data importer (utility to import data into MySQL) + * Check how data are loaded into ClickHouse + +### airline.ontime Data Set in CSV files +Run [download script](../clickhouse_mysql_examples/airline_ontime_data_download.sh) + +You may want to adjust dirs where to keep `ZIP` and `CSV` file + +In `airline_ontime_data_download.sh` edit these lines: +```bash +... +ZIP_FILES_DIR="zip" +CSV_FILES_DIR="csv" +... +``` +You may want to adjust number of files to download (In case downloading all it may take some time). + +Specify year and months range as you wish: +```bash +... +echo "Download files into $ZIP_FILES_DIR" +for year in `seq 1987 2017`; do + for month in `seq 1 12`; do +... +``` + +```bash +./airline_ontime_data_download.sh +``` +Downloading can take some time. + +### airline.ontime MySQL Table +Create MySQL table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](../clickhouse_mysql_examples/airline_ontime_schema_mysql.sql): +```bash +mysql -uroot -p < clickhouse_mysql_examples/airline_ontime_schema_mysql.sql +``` + +### airline.ontime ClickHouse Table +Create ClickHouse table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_ch.sql](../clickhouse_mysql_examples/airline_ontime_schema_ch.sql): +```bash +clickhouse-client -mn < clickhouse_mysql_examples/airline_ontime_schema_ch.sql +``` + +### airline.ontime Data Reader +Run [datareader script](../clickhouse_mysql_examples/airline_ontime_data_mysql_to_ch_reader.sh) + +You may want to adjust `PYTHON` path and source and target hosts and usernames +```bash +... +PYTHON=python3.6 +PYTHON=/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy +... +``` +```bash +... + --src-host=127.0.0.1 \ + --src-user=root \ + --dst-host=127.0.0.1 \ +... +``` +```bash +./airline_ontime_data_mysql_to_ch_reader.sh +``` + +### airline.ontime Data Importer +Run [data importer script](../clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh) + +You may want to adjust `CSV` files location, number of imported files and MySQL user/password used for import +```bash +... 
+# looking for csv files in this dir +FILES_TO_IMPORT_DIR="/mnt/nas/work/ontime" + +# limit import to this number of files +FILES_TO_IMPORT_NUM=3 +... +``` +```bash +... + -u root \ +... +``` + +```bash +./airline_ontime_mysql_data_import.sh +``` + +# Testing + +## Testing General Schema + +### MySQL Data Types + +#### Numeric Types + + * `BIT` the number of bits per value, from 1 to 64 + * `TINYINT` -128 to 127. The unsigned range is 0 to 255 + * `BOOL`, `BOOLEAN` synonyms for `TINYINT(1)` + * `SMALLINT` -32768 to 32767. The unsigned range is 0 to 65535 + * `MEDIUMINT` -8388608 to 8388607. The unsigned range is 0 to 16777215. + * `INT`, `INTEGER` -2147483648 to 2147483647. The unsigned range is 0 to 4294967295 + * `BIGINT` -9223372036854775808 to 9223372036854775807. The unsigned range is 0 to 18446744073709551615 + + * `SERIAL` is an alias for `BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE`. + * `DEC`, `DECIMAL`, `FIXED`, `NUMERIC` A packed ?exact? fixed-point number + * `FLOAT` Permissible values are -3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38 + * `DOUBLE`, `REAL` Permissible values are -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308 + + +#### Date and Time Types + + * `DATE` The supported range is '1000-01-01' to '9999-12-31' + * `DATETIME` The supported range is '1000-01-01 00:00:00.000000' to '9999-12-31 23:59:59.999999' + * `TIMESTAMP` The range is '1970-01-01 00:00:01.000000' UTC to '2038-01-19 03:14:07.999999' + * `TIME` The range is '-838:59:59.000000' to '838:59:59.000000' + * `YEAR` Values display as 1901 to 2155, and 0000 + +#### String Types + * `CHAR` The range of M is 0 to 255. If M is omitted, the length is 1. + * `VARCHAR` The range of M is 0 to 65,535 + * `BINARY` similar to CHAR + * `VARBINARY` similar to VARCHAR + * `TINYBLOB` maximum length of 255 + * `TINYTEXT` maximum length of 255 + * `BLOB` maximum length of 65,535 + * `TEXT` maximum length of 65,535 + * `MEDIUMBLOB` maximum length of 16,777,215 + * `MEDIUMTEXT` maximum length of 16,777,215 + * `LONGBLOB` maximum length of 4,294,967,295 or 4GB + * `LONGTEXT` maximum length of 4,294,967,295 or 4GB + * `ENUM` can have a maximum of 65,535 distinct elements + * `SET` can have a maximum of 64 distinct members + + * `JSON` native JSON data type defined by RFC 7159 + +--- + +### ClickHouse Data Types + + * `Date` number of days since 1970-01-01 + * `DateTime` Unix timestamp + * `Enum8` or `Enum16`. A set of enumerated string values that are stored as `Int8` or `Int16`. The numeric values must be within -128..127 for Enum8 and -32768..32767 for Enum16 + * `Float32`, `Float64` + + * `Int8` -128 127 + * `UInt8` 0 255 + + * `Int16` -32768 32767 + * `UInt16` 0 65535 + + * `Int32` -2147483648 2147483647 + * `UInt32` 0 4294967295 + + * `Int64` -9223372036854775808 9223372036854775807 + * `UInt64` 0 18446744073709551615 + + * `FixedString(N)` string of `N` bytes (not characters or code points) + * `String` The length is not limited. The value can contain an arbitrary set of bytes, including null bytes + +--- + +### MySQL -> ClickHouse Data Types Mapping + +#### Numeric Types + + * `BIT` -> ??? (possibly `String`?) + * `TINYINT` -> `Int8`, `UInt8` + * `BOOL`, `BOOLEAN` -> `UInt8` + * `SMALLINT` -> `Int16`, `UInt16` + * `MEDIUMINT` -> `Int32`, `UInt32` + * `INT`, `INTEGER` -> `Int32`, `UInt32` + * `BIGINT` -> `Int64`, `UInt64` + + * `SERIAL` -> `UInt64` + * `DEC`, `DECIMAL`, `FIXED`, `NUMERIC` -> ???? (possibly `String`?) 
+ * `FLOAT` -> `Float32` + * `DOUBLE`, `REAL` -> `Float64` + + +#### Date and Time Types + + * `DATE` -> `Date` (for valid values) or `String` + `Date` Allows storing values from just after the beginning of the Unix Epoch + to the upper threshold defined by a constant at the compilation stage + (currently, this is until the year 2038, but it may be expanded to 2106) + * `DATETIME` -> `DateTime` (for valid values) or `String` + * `TIMESTAMP` -> `DateTime` + * `TIME` -> ????? (possibly `String`?) + * `YEAR` -> `UInt16` + + +#### String Types + + * `CHAR` -> `FixedString` + * `VARCHAR` -> `String` + * `BINARY` -> `String` + * `VARBINARY` -> `String` + * `TINYBLOB` -> `String` + * `TINYTEXT` -> `String` + * `BLOB` -> `String` + * `TEXT` -> `String` + * `MEDIUMBLOB` -> `String` + * `MEDIUMTEXT` -> `String` + * `LONGBLOB` -> `String` + * `LONGTEXT` -> `String` + +#### Set Types + * `ENUM` -> `Enum8`, `Enum16` + * `SET` -> `Array(Int8)` + +#### Custom Types + * `JSON` -> ?????? (possibly `String`?) + + +### MySQL Test Tables + +We have to separate test table into several ones because of this error, produced by MySQL: +```text +ERROR 1118 (42000): Row size too large. The maximum row size for the used table type, not counting BLOBs, is 65535. This includes storage overhead, check the manual. You have to change some columns to TEXT or BLOBs +``` + +```mysql +CREATE TABLE datatypes( + + bit_1 BIT(1), + bit_2 BIT(64), + + tinyint_1 TINYINT COMMENT '-128 to 127', + u_tinyint_1 TINYINT UNSIGNED COMMENT '0 to 255', + + bool_1 BOOL, + bool_2 BOOLEAN, + + smallint_1 SMALLINT COMMENT '-32768 to 32767', + u_smallint_1 SMALLINT UNSIGNED COMMENT '0 to 65535', + + mediumint_1 MEDIUMINT COMMENT '-8388608 to 8388607', + u_mediumint_1 MEDIUMINT UNSIGNED COMMENT '0 to 16777215', + + int_1 INT COMMENT '-2147483648 to 2147483647', + u_int_1 INT UNSIGNED COMMENT '0 to 4294967295', + + integer_1 INTEGER COMMENT '-2147483648 to 2147483647', + u_integer_1 INTEGER UNSIGNED COMMENT '0 to 4294967295', + + bigint_1 BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', + u_bigint_1 BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', + + serial_1 SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 0 to 18446744073709551615', + + decimal_1 DECIMAL(3,2) COMMENT 'exact fixed-point number', + dec_1 DEC(3,2) COMMENT 'alias for DECIMAL', + fixed_1 FIXED(3,2) COMMENT 'alias for DECIMAL', + numeric_1 NUMERIC(3,2) COMMENT 'alias for DECIMAL', + + float_1 FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', + u_float_1 FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', + + double_1 DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_double_1 DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + real_1 REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_real_1 REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 
0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + date_1 DATE COMMENT '1000-01-01 to 9999-12-31', + datetime_1 DATETIME COMMENT '1000-01-01 00:00:00 to 9999-12-31 23:59:59', + timestamp_1 TIMESTAMP COMMENT '1970-01-01 00:00:01 UTC to 2038-01-19 03:14:07 UTC', + time_1 TIME COMMENT '-838:59:59 to 838:59:59', + year_1 YEAR COMMENT '1901 to 2155, and 0000', + + char_0 CHAR(0), + char_1 CHAR(1), + char_2 CHAR(255), + + varchar_0 VARCHAR(0), + varchar_1 VARCHAR(1), + + binary_0 BINARY(0) COMMENT 'similar to CHAR', + binary_1 BINARY(1) COMMENT 'similar to CHAR', + binary_2 BINARY(255) COMMENT 'similar to CHAR', + + varbinary_0 VARBINARY(0) COMMENT 'similar to VARCHAR', + varbinary_1 VARBINARY(1) COMMENT 'similar to VARCHAR', + + tinyblob_1 TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', + tinytext_1 TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', + + blob_1 BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', + text_1 TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', + + mediumblob_1 MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', + mediumtext_1 MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', + + longblob_1 LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', + longtext_1 LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) characters' +) +; + +CREATE TABLE enum_datatypes( + enum_1 ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements' +) +; + +CREATE TABLE set_datatypes( + set_1 SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT ' can have a maximum of 64 distinct members' +) +; + +CREATE TABLE json_datatypes( + json_1 JSON +) +; + +CREATE TABLE long_varchar_datatypes( + varchar_2 VARCHAR(65532) +) +; + +CREATE TABLE long_varbinary_datatypes( + varbinary_2 VARBINARY(65532) COMMENT 'similar to VARCHAR' +) +; +``` + + +```mysql +-- in order to be able to set timestamp = '1970-01-01 00:00:01' +set time_zone='+00:00'; +``` + +Insert minimal acceptable values into the test table: + +```mysql +-- MIN values +INSERT INTO datatypes SET + + bit_1 = 0b0, -- BIT(1), + bit_2 = 0b0, -- BIT(64), + + tinyint_1 = -128, -- TINYINT COMMENT '-128 to 127', + u_tinyint_1 = 0, -- TINYINT UNSIGNED COMMENT '0 to 255', + + bool_1 = FALSE, -- BOOL, + bool_2 = FALSE, -- BOOLEAN, + + smallint_1 = -32768, -- SMALLINT COMMENT '-32768 to 32767', + u_smallint_1 = 0, -- SMALLINT UNSIGNED COMMENT '0 to 65535', + + mediumint_1 = -8388608, -- MEDIUMINT COMMENT '-8388608 to 8388607', + u_mediumint_1 = 0, -- MEDIUMINT UNSIGNED COMMENT '0 to 16777215', + + int_1 = -2147483648, -- INT COMMENT '-2147483648 to 2147483647', + u_int_1 = 0, -- INT UNSIGNED COMMENT '0 to 4294967295', + + integer_1 = -2147483648, -- INTEGER COMMENT '-2147483648 to 2147483647', + u_integer_1 = 0, -- INTEGER UNSIGNED COMMENT '0 to 4294967295', + + bigint_1 = -9223372036854775808, -- BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', + u_bigint_1 = 0, -- BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', + + serial_1 = 0, -- SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 
0 to 18446744073709551615', + + decimal_1 = -9.99, -- DECIMAL(3,2) COMMENT 'exact fixed-point number', + dec_1 = -9.99, -- DEC(3,2) COMMENT 'alias for DECIMAL', + fixed_1 = -9.99, -- FIXED(3,2) COMMENT 'alias for DECIMAL', + numeric_1 = -9.99, -- NUMERIC(3,2) COMMENT 'alias for DECIMAL', + + float_1 = -3.402823466E+38, -- FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', + u_float_1 = 0, -- FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', + + double_1 = -1.7976931348623157E+308, -- DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_double_1 = 0, -- DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + real_1 = -1.7976931348623157E+308, -- REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_real_1 = 0, -- REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + date_1 = '1970-01-01', -- DATE COMMENT '1000-01-01 to 9999-12-31', + datetime_1 = '1970-01-01 00:00:00', -- DATETIME COMMENT '1000-01-01 00:00:00 to 9999-12-31 23:59:59', + timestamp_1 = '1970-01-01 00:00:01', -- TIMESTAMP COMMENT '1970-01-01 00:00:01 UTC to 2038-01-19 03:14:07 UTC', + time_1 = '-838:59:59', -- TIME COMMENT '-838:59:59 to 838:59:59', + year_1 = 1901, -- YEAR COMMENT '1901 to 2155, and 0000', + + char_0 = '', -- CHAR(0), + char_1 = '', -- CHAR(1), + char_2 = '', -- CHAR(255), + + varchar_0 = '', -- VARCHAR(0), + varchar_1 = '', -- VARCHAR(1), + + binary_0 = '', -- BINARY(0) COMMENT 'similar to CHAR', + binary_1 = '', -- BINARY(1) COMMENT 'similar to CHAR', + binary_2 = '', -- BINARY(255) COMMENT 'similar to CHAR', + + varbinary_0 = '', -- VARBINARY(0) COMMENT 'similar to VARCHAR', + varbinary_1 = '', -- VARBINARY(1) COMMENT 'similar to VARCHAR', + + tinyblob_1 = '', -- TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', + tinytext_1 = '', -- TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', + + blob_1 = '', -- BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', + text_1 = '', -- TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', + + mediumblob_1 = '', -- MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', + mediumtext_1 = '', -- MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', + + longblob_1 = '', -- LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', + longtext_1 = '' -- LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 
1) characters' +; + +INSERT INTO enum_datatypes SET + enum_1 = NULL -- ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements' +; + +INSERT INTO set_datatypes SET + set_1 = '' -- SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 64 distinct members' +; + +INSERT INTO json_datatypes SET + json_1 = '{}' -- JSON +; + +INSERT INTO long_varchar_datatypes SET + varchar_2 = "" +; + +INSERT INTO long_varbinary_datatypes SET + varbinary_2 = "" +; +``` + +Insert maximum acceptable values into the test table: + +```mysql +-- MAX values +INSERT INTO datatypes SET + + bit_1 = 0b1, -- BIT(1), + bit_2 = 0b1111111111111111111111111111111111111111111111111111111111111111, -- BIT(64), + + tinyint_1 = 127, -- TINYINT COMMENT '-128 to 127', + u_tinyint_1 = 255, -- TINYINT UNSIGNED COMMENT '0 to 255', + + bool_1 = TRUE, -- BOOL, + bool_2 = TRUE, -- BOOLEAN, + + smallint_1 = 32767, -- SMALLINT COMMENT '-32768 to 32767', + u_smallint_1 = 65535, -- SMALLINT UNSIGNED COMMENT '0 to 65535', + + mediumint_1 = 8388607, -- MEDIUMINT COMMENT '-8388608 to 8388607', + u_mediumint_1 = 16777215, -- MEDIUMINT UNSIGNED COMMENT '0 to 16777215', + + int_1 = 2147483647, -- INT COMMENT '-2147483648 to 2147483647', + u_int_1 = 4294967295, -- INT UNSIGNED COMMENT '0 to 4294967295', + + integer_1 = 2147483647, -- INTEGER COMMENT '-2147483648 to 2147483647', + u_integer_1 = 4294967295, -- INTEGER UNSIGNED COMMENT '0 to 4294967295', + + bigint_1 = 9223372036854775807, -- BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', + u_bigint_1 = 18446744073709551615, -- BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', + + serial_1 = 18446744073709551615, -- SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 0 to 18446744073709551615', + + decimal_1 = 9.99, -- DECIMAL(3,2) COMMENT 'exact fixed-point number', + dec_1 = 9.99, -- DEC(3,2) COMMENT 'alias for DECIMAL', + fixed_1 = 9.99, -- FIXED(3,2) COMMENT 'alias for DECIMAL', + numeric_1 = 9.99, -- NUMERIC(3,2) COMMENT 'alias for DECIMAL', + + float_1 = 3.402823466E+38, -- FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', + u_float_1 = 3.402823466E+38, -- FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', + + double_1 = 1.7976931348623157E+308, -- DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_double_1 = 1.7976931348623157E+308, -- DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + real_1 = 1.7976931348623157E+308, -- REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_real_1 = 1.7976931348623157E+308, -- REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 
0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + date_1 = '2149-06-01', -- DATE COMMENT '1000-01-01 to 9999-12-31', + datetime_1 = '2106-02-01 23:59:59', -- DATETIME COMMENT '1000-01-01 00:00:00 to 9999-12-31 23:59:59', + timestamp_1 = '2038-01-19 03:14:07', -- TIMESTAMP COMMENT '1970-01-01 00:00:01 UTC to 2038-01-19 03:14:07 UTC', + time_1 = '838:59:59', -- TIME COMMENT '-838:59:59 to 838:59:59', + year_1 = 2155, -- YEAR COMMENT '1901 to 2155, and 0000', + + char_0 = '', -- CHAR(0), + char_1 = 'a', -- CHAR(1), + char_2 = 'abcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcde', -- CHAR(255), + + varchar_0 = '', -- VARCHAR(0), + varchar_1 = 'a', -- VARCHAR(1), + + binary_0 = '', -- BINARY(0) COMMENT 'similar to CHAR', + binary_1 = 'a', -- BINARY(1) COMMENT 'similar to CHAR', + binary_2 = 'abcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcde', -- BINARY(255) COMMENT 'similar to CHAR', + + varbinary_0 = '', -- VARBINARY(0) COMMENT 'similar to VARCHAR', + varbinary_1 = 'a', -- VARBINARY(1) COMMENT 'similar to VARCHAR', + + tinyblob_1 = 'a', -- TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', + tinytext_1 = 'a', -- TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', + + blob_1 = 'a', -- BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', + text_1 = 'a', -- TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', + + mediumblob_1 = 'a', -- MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', + mediumtext_1 = 'a', -- MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', + + longblob_1 = 'a', -- LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', + longtext_1 = 'a' -- LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 
1) characters' +; + +INSERT INTO enum_datatypes SET + enum_1 = 'a' -- ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements' +; + +INSERT INTO set_datatypes SET + set_1 = 'a,b,c' -- SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 64 distinct members', +; + +INSERT INTO json_datatypes SET + json_1 = '{"a":1, "b":2, "c":3}' -- JSON +; + +INSERT INTO long_varchar_datatypes SET + varchar_2 = "abc" +; + +INSERT INTO long_varbinary_datatypes SET + varbinary_2 = "abc" +; +``` + +### ClickHouse Test Tables + +```sql +CREATE TABLE datatypes( + bit_1 Nullable(String), -- bit_1 BIT(1), + bit_2 Nullable(String), -- bit_2 BIT(64), + + tinyint_1 Nullable(Int8), -- tinyint_1 TINYINT COMMENT '-128 to 127', + u_tinyint_1 Nullable(UInt8), -- u_tinyint_1 TINYINT UNSIGNED COMMENT '0 to 255', + + bool_1 Nullable(UInt8), -- bool_1 BOOL, + bool_2 Nullable(UInt8), -- bool_2 BOOLEAN, + + smallint_1 Nullable(Int16), -- smallint_1 SMALLINT COMMENT '-32768 to 32767', + u_smallint_1 Nullable(UInt16), -- u_smallint_1 SMALLINT UNSIGNED COMMENT '0 to 65535', + + mediumint_1 Nullable(Int32), -- mediumint_1 MEDIUMINT COMMENT '-8388608 to 8388607', + u_mediumint_1 Nullable(UInt32), -- u_mediumint_1 MEDIUMINT UNSIGNED COMMENT '0 to 16777215', + + int_1 Nullable(Int32), -- int_1 INT COMMENT '-2147483648 to 2147483647', + u_int_1 Nullable(UInt32), -- u_int_1 INT UNSIGNED COMMENT '0 to 4294967295', + + integer_1 Nullable(Int32), -- integer_1 INTEGER COMMENT '-2147483648 to 2147483647', + u_integer_1 Nullable(UInt32), -- u_integer_1 INTEGER UNSIGNED COMMENT '0 to 4294967295', + + bigint_1 Nullable(Int64), -- bigint_1 BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', + u_bigint_1 Nullable(UInt64), -- u_bigint_1 BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', + + serial_1 Nullable(UInt64), -- serial_1 SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 0 to 18446744073709551615', + + decimal_1 Nullable(String), -- decimal_1 DECIMAL(3,2) COMMENT 'exact fixed-point number', + dec_1 Nullable(String), -- dec_1 DEC(3,2) COMMENT 'alias for DECIMAL', + fixed_1 Nullable(String), -- fixed_1 FIXED(3,2) COMMENT 'alias for DECIMAL', + numeric_1 Nullable(String), -- numeric_1 NUMERIC(3,2) COMMENT 'alias for DECIMAL', + + float_1 Nullable(Float32), -- float_1 FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', + u_float_1 Nullable(Float32), -- u_float_1 FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', + + double_1 Nullable(Float64), -- double_1 DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_double_1 Nullable(Float64), -- u_double_1 DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + real_1 Nullable(Float64), -- real_1 REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_real_1 Nullable(Float64), -- u_real_1 REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 
0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + date_1 Nullable(Date), -- date_1 DATE COMMENT '1000-01-01 to 9999-12-31', + datetime_1 Nullable(DateTime), -- datetime_1 DATETIME COMMENT '1000-01-01 00:00:00.000000 to 9999-12-31 23:59:59.999999', + timestamp_1 Nullable(DateTime), -- timestamp_1 TIMESTAMP COMMENT '1970-01-01 00:00:01.000000 UTC to 2038-01-19 03:14:07.999999 UTC', + time_1 Nullable(String), -- time_1 TIME COMMENT '-838:59:59.000000 to 838:59:59.000000', + year_1 Nullable(UInt16), -- year_1 YEAR COMMENT '1901 to 2155, and 0000', + + char_0 Nullable(FixedString(1)), -- char_0 CHAR(0), + char_1 Nullable(FixedString(1)), -- char_1 CHAR(1), + char_2 Nullable(FixedString(255)), -- char_2 CHAR(255), + + varchar_0 Nullable(String), -- varchar_0 VARCHAR(0), + varchar_1 Nullable(String), -- varchar_1 VARCHAR(1), + + binary_0 Nullable(String), -- binary_0 BINARY(0) COMMENT 'similar to CHAR', + binary_1 Nullable(String), -- binary_1 BINARY(1) COMMENT 'similar to CHAR', + binary_2 Nullable(String), -- binary_2 BINARY(255) COMMENT 'similar to CHAR', + + varbinary_0 Nullable(String), -- varbinary_0 VARBINARY(0) COMMENT 'similar to VARCHAR', + varbinary_1 Nullable(String), -- varbinary_1 VARBINARY(1) COMMENT 'similar to VARCHAR', + + tinyblob_1 Nullable(String), -- tinyblob_1 TINYBLOB COMMENT 'maximum length of 255 (2^8 - 1) bytes', + tinytext_1 Nullable(String), -- tinytext_1 TINYTEXT COMMENT 'maximum length of 255 (2^8 - 1) characters', + + blob_1 Nullable(String), -- blob_1 BLOB COMMENT 'maximum length of 65,535 (2^16 - 1) bytes', + text_1 Nullable(String), -- text_1 TEXT COMMENT 'maximum length of 65,535 (2^16 - 1) characters', + + mediumblob_1 Nullable(String), -- mediumblob_1 MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 - 1) bytes', + mediumtext_1 Nullable(String), -- mediumtext_1 MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 - 1) characters', + + longblob_1 Nullable(String), -- longblob_1 LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 - 1) bytes', + longtext_1 Nullable(String) -- longtext_1 LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 -
1) characters', + +) ENGINE = Log +; + +CREATE TABLE enum_datatypes( + enum_1 Enum16('a'=1, 'b'=2, 'c'=3, 'd'=4, 'e'=5, 'f'=6) -- enum_1 ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements', +) ENGINE = Memory +; + +CREATE TABLE set_datatypes( + set_1 Array(Enum16('a'=1, 'b'=2, 'c'=3, 'd'=4, 'e'=5, 'f'=6)) -- set_1 SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 64 distinct members', +) ENGINE = Memory +; + +-- Alternative mapping for SET - a plain String instead of Array(Enum16). Use either this variant or the one above, not both: +-- CREATE TABLE set_datatypes( +-- set_1 String -- set_1 SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 64 distinct members', +-- ) ENGINE = Memory +-- ; + + +CREATE TABLE json_datatypes( + json_1 String -- json_1 JSON +) ENGINE = Memory +; + +CREATE TABLE long_varchar_datatypes( + varchar_2 String +) ENGINE = Memory +; + +CREATE TABLE long_varbinary_datatypes( + varbinary_2 String +) ENGINE = Memory +; +``` diff --git a/docs/usage-references.md b/docs/usage-references.md new file mode 100644 index 0000000..195c75b --- /dev/null +++ b/docs/usage-references.md @@ -0,0 +1,19 @@ +## Use Cases +- [3 Step Migration of MySQL data to Clickhouse for faster analytics.](https://mydbops.wordpress.com/2020/02/21/3-step-migration-of-mysql-data-to-clickhouse-for-faster-analytics/) +- [Hybrid OLTP/Analytics Database Workloads: Replicating MySQL Data to ClickHouse](https://severalnines.com/database-blog/hybrid-oltpanalytics-database-workloads-replicating-mysql-data-clickhouse) +- [How to import and replicate data from MySQL to ClickHouse](https://minervadb.com/wp-content/uploads/2019/10/How-to-import-and-replicate-data-from-MySQL-to-ClickHouse.pdf) +- [Use Yandex ClickHouse for Analytics with Data from MySQL](https://www.tienle.com/2018/05-04/use-yandex-clickhouse-for-analytics-with-data-from-mysql.html) + +## Talks +- [Opensource Column Store Databases: MariaDB ColumnStore vs. ClickHouse](https://www.percona.com/live/19/sites/default/files/slides/Opensource%20Column%20Store%20Databases_%20MariaDB%20ColumnStore%20vs.%20ClickHouse%20-%20FileId%20-%20188040.pdf) +- [Replicating MySQL Data to TiDB For Near Real-Time Analytics](https://dataops.barcelona/wp-content/uploads/2019/06/Replicating-to-TiDb-francisco-Bordenave.pdf) + +## TODOs and HOWTOs
- [Clickhouse install and use /clickhouse-mysql installation](http://www.programmersought.com/article/7079240138/) +- [Replication from MySQL to ClickHouse](https://www.goplardb.com/post/replication-from-mysql-to-clickhouse) + + +## Other References +- [CH integrations](https://clickhouse.tech/docs/en/interfaces/third-party/integrations/) +- [awesomeopensource](https://awesomeopensource.com/projects/clickhouse) + diff --git a/notes.txt b/notes.txt new file mode 100644 index 0000000..20da4c9 --- /dev/null +++ b/notes.txt @@ -0,0 +1,3 @@ +# Add delete field - set column 45 to "0" in every row of the CSV + +awk -F"," 'BEGIN { OFS = "," } {$45="0"; print}' test.csv > test-out.csv \ No newline at end of file diff --git a/package_clear_old.sh b/package_clear_old.sh index ae0e5e3..abcadff 100755 --- a/package_clear_old.sh +++ b/package_clear_old.sh @@ -1,15 +1,28 @@ #!/bin/bash -TO_DEL="build dist clickhouse_mysql.egg-info deb_dist" +# List of items (files and folders) to be deleted.
+# These items are package-related +ITEMS_TO_DEL=" +build +dist +clickhouse_mysql.egg-info +deb_dist +" echo "########################################" echo "### Clear all build and release data ###" echo "########################################" -echo "Deleting:" -for DEL in $TO_DEL; do - echo " $DEL" +echo "About to delete:" +DEL="" +for ITEM in ${ITEMS_TO_DEL}; do + echo " ${ITEM}" + DEL="${DEL} ${ITEM}" done -echo "rm -rf $TO_DEL" -rm -rf $TO_DEL +if [[ -z "${DEL}" ]]; then + echo "No items to delete" +else + echo "rm -rf ${DEL}" + rm -rf ${DEL} +fi diff --git a/package_deb_distr.sh b/package_distr_deb.sh similarity index 61% rename from package_deb_distr.sh rename to package_distr_deb.sh index 90da948..85d84a3 100755 --- a/package_deb_distr.sh +++ b/package_distr_deb.sh @@ -8,5 +8,13 @@ echo "##########################" python3 setup.py --command-packages=stdeb.command bdist_deb +echo "" +echo "" +echo "" +echo "############################" +echo "### Results - .deb files ###" +echo "############################" +ls -la ./deb_dist/*.deb + # pypi stdeb # apt install python3-all python3-stdeb diff --git a/package_rpm_distr.sh b/package_distr_rpm.sh similarity index 57% rename from package_rpm_distr.sh rename to package_distr_rpm.sh index 771e970..59b9cd6 100755 --- a/package_rpm_distr.sh +++ b/package_distr_rpm.sh @@ -2,11 +2,22 @@ ./package_clear_old.sh +echo "##########################" +echo "### Build RPM packages ###" +echo "##########################" + python3 setup.py bdist_rpm --packager="Vladislav Klimenko " # --spec-only -# ls -l ./build/bdist.linux-x86_64/rpm/SPECS/ -# ls -l ./dist/ +echo "" +echo "" +echo "" +echo "######################################" +echo "### Results - .spec and .rpm files ###" +echo "######################################" +ls -la ./build/bdist.linux-x86_64/rpm/SPECS/*.spec +ls -la ./dist/*.rpm + # build RPMs with # rpmbuild -ba ./build/bdist.linux-x86_64/rpm/SPECS/clickhouse-mysql.spec diff --git a/package_source_distr.sh b/package_distr_source.sh similarity index 100% rename from package_source_distr.sh rename to package_distr_source.sh diff --git a/package_wheels_distr.sh b/package_distr_wheels.sh similarity index 100% rename from package_wheels_distr.sh rename to package_distr_wheels.sh diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..da4173a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +clickhouse-driver==0.2.0 +mysql-replication==0.23 +mysqlclient==2.0.3 +PyMySQL==1.0.2 +pytz==2021.1 +tzlocal==2.1 diff --git a/setup.py b/setup.py index 9ceeca0..a8b39e2 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ name="clickhouse-mysql", # version should comply with PEP440 - version='0.0.20180321', + version='0.0.20200128', description='MySQL to ClickHouse data migrator', long_description='MySQL to ClickHouse data migrator', @@ -20,7 +20,7 @@ license="MIT", - # see https://pypi.python.org/pypi?%3Aaction=list_classifiers + # see https://pypi.python.org/pypi?:action=list_classifiers classifiers=[ # How mature is this project? Common values are # 3 - Alpha @@ -29,6 +29,8 @@ 'Development Status :: 3 - Alpha', 'Intended Audience :: Developers', + 'Intended Audience :: System Administrators', + 'Topic :: Database', # should match license above @@ -77,6 +79,8 @@ 'clickhouse-driver', 'configobj', 'setuptools', + 'requests_toolbelt', + 'requests' ], # cross-platform support for pip to create the appropriate form of executable