From e12dda03a10b9e90146fb9cd16a67bade8db0f59 Mon Sep 17 00:00:00 2001
From: Maksim Fedotov
Date: Thu, 20 Dec 2018 09:57:44 +0300
Subject: [PATCH 01/67] fix a bug in csvwriter when the number of events is
 equal to mempool-max-events-num, which results in TypeError: map argument #1
 must support iteration

---
 clickhouse_mysql/writer/csvwriter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clickhouse_mysql/writer/csvwriter.py b/clickhouse_mysql/writer/csvwriter.py
index 10ec143..eba0928 100644
--- a/clickhouse_mysql/writer/csvwriter.py
+++ b/clickhouse_mysql/writer/csvwriter.py
@@ -134,7 +134,7 @@ def insert(self, event_or_events):
             self.writer.writerow(self.convert(row))
 
     def push(self):
-        if not self.next_writer_builder:
+        if not self.next_writer_builder or not self.fieldnames:
             return
 
         event = Event()

From 3422bb475b0765c7af0a0299cd8591a0d242ab1f Mon Sep 17 00:00:00 2001
From: Wing-Lo
Date: Fri, 22 Mar 2019 17:51:01 +0800
Subject: [PATCH 02/67] repair CREATE SQL error

row is a tuple, so reading it with row['Tables_in_db'] raises
"tuple indices must be integers, not str", and the caller only prints
"Can not list tables on host...". Change row['Tables_in_db'] to row[0]
so that listing tables works.
---
 clickhouse_mysql/dbclient/mysqlclient.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clickhouse_mysql/dbclient/mysqlclient.py b/clickhouse_mysql/dbclient/mysqlclient.py
index aac15ce..5635643 100644
--- a/clickhouse_mysql/dbclient/mysqlclient.py
+++ b/clickhouse_mysql/dbclient/mysqlclient.py
@@ -100,7 +100,7 @@ def tables_list(self, db):
             tables = []
             for row in self.cursor:
                 logging.debug("table: {}".format(row))
-                table_name = row['Tables_in_db']
+                table_name = row[0]
                 tables.append(table_name)
 
         except:

From d00a0db1de8fad677ac62ea14c99e24cf49f5518 Mon Sep 17 00:00:00 2001
From: Wing-Lo
Date: Sat, 23 Mar 2019 14:30:33 +0800
Subject: [PATCH 03/67] Change the generated table name when dst-schema is
 passed; cluster support is half-tested
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 clickhouse_mysql/clioptions.py | 7 +++++++
 clickhouse_mysql/config.py | 5 ++++-
 clickhouse_mysql/dbclient/mysqlclient.py | 4 ++--
 clickhouse_mysql/tableprocessor.py | 11 +++++++++--
 clickhouse_mysql/tablesqlbuilder.py | 18 ++++++++++--------
 5 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/clickhouse_mysql/clioptions.py b/clickhouse_mysql/clioptions.py
index 87823c3..77e9bf4 100644
--- a/clickhouse_mysql/clioptions.py
+++ b/clickhouse_mysql/clioptions.py
@@ -142,6 +142,7 @@ class CLIOptions(Options):
         'dst_user': 'default',
         'dst_password': '',
         'dst_schema': None,
+        'dst_cluster': None,
         'dst_table': None,
         'dst_create_table': False,
 
@@ -418,6 +419,12 @@ def options(self):
             default=self.default_options['dst_schema'],
             help='Database/schema to be used when writing to dst. Ex.: db1'
         )
+        argparser.add_argument(
+            '--dst-cluster',
+            type=str,
+            default=self.default_options['dst_cluster'],
+            help='Cluster to be used when writing to dst. Ex.: db1'
+        )
         argparser.add_argument(
             '--dst-table',
             type=str,
diff --git a/clickhouse_mysql/config.py b/clickhouse_mysql/config.py
index e2041e5..a3f5f7e 100644
--- a/clickhouse_mysql/config.py
+++ b/clickhouse_mysql/config.py
@@ -56,7 +56,6 @@ def __init__(self):
             print("can't read binlog position from file {}".format(
                 self.options['binlog_position_file'],
             ))
-
         # build application config out of aggregated options
         self.config = {
             #
@@ -119,6 +118,8 @@ def __init__(self):
                         'user': self.options['dst_user'],
                         'password': self.options['dst_password'],
                     },
+                    'dst_schema': self.options['dst_schema'],
+                    'dst_cluster': self.options['dst_cluster'],
                     'dst_create_table': self.options.get_bool('dst_create_table'),
                 },
             },
@@ -248,6 +249,8 @@ def table_sql_builder(self):
             user=self.config['table_builder']['mysql']['user'],
             password=self.config['table_builder']['mysql']['password'],
             dbs=self.config['table_builder']['mysql']['dbs'],
+            shema=self.config['table_builder']['clickhouse']['dst_schema'],
+            cluster=self.config['table_builder']['clickhouse']['dst_cluster'],
             tables=self.config['table_builder']['mysql']['tables'],
             tables_prefixes=self.config['table_builder']['mysql']['tables_prefixes'],
         )
diff --git a/clickhouse_mysql/dbclient/mysqlclient.py b/clickhouse_mysql/dbclient/mysqlclient.py
index 5635643..6f8bee1 100644
--- a/clickhouse_mysql/dbclient/mysqlclient.py
+++ b/clickhouse_mysql/dbclient/mysqlclient.py
@@ -99,11 +99,11 @@ def tables_list(self, db):
             tables = []
             for row in self.cursor:
-                logging.debug("table: {}".format(row))
                 table_name = row[0]
                 tables.append(table_name)
 
-        except:
+        except Exception as err:
+            logging.debug("Unexpected error: {}".format(str(err)))
             raise Exception("Can not list tables on host={} user={} password={} db={}".format(
                 self.host,
                 self.user,
diff --git a/clickhouse_mysql/tableprocessor.py b/clickhouse_mysql/tableprocessor.py
index 18aa60d..0d54a3e 100644
--- a/clickhouse_mysql/tableprocessor.py
+++ b/clickhouse_mysql/tableprocessor.py
@@ -25,6 +25,8 @@ def __init__(
             user=None,
             password=None,
             dbs=None,
+            shema=None,
+            cluster=None,
             tables=None,
             tables_prefixes=None,
     ):
@@ -46,6 +48,8 @@ def __init__(
             'user': user,
             'password': password,
         })
+        self.shema = shema
+        self.cluster = cluster
 
     def dbs_tables_lists(self):
         """
@@ -131,7 +135,7 @@ def tables_match(self, db, prefix):
         return res
 
     @staticmethod
-    def create_full_table_name(db=None, table=None):
+    def create_full_table_name(shema=None, db=None, table=None):
        """
        Create fully-specified table name as `db`.`table` or just `table`
 
@@ -139,7 +143,10 @@ def create_full_table_name(db=None, table=None):
        :param table:
        :return: `db`.`table` or just `table`
        """
-        return '`{0}`.`{1}`'.format(db, table) if db else '`{0}`'.format(table)
+        if shema != None:
+            return '`{0}`.`{1}`'.format(shema, db+"__"+table) if db else '`{0}`'.format(table)
+        else:
+            return '`{0}`.`{1}`'.format(db, table) if db else '`{0}`'.format(table)
 
     @staticmethod
     def is_full_table_name(full_name):
diff --git a/clickhouse_mysql/tablesqlbuilder.py b/clickhouse_mysql/tablesqlbuilder.py
index 6fe4db6..f5e2b93 100644
--- a/clickhouse_mysql/tablesqlbuilder.py
+++ b/clickhouse_mysql/tablesqlbuilder.py
@@ -3,6 +3,7 @@
 
 from clickhouse_mysql.tableprocessor import TableProcessor
 from MySQLdb.cursors import Cursor
+import logging
 
 class TableSQLBuilder(TableProcessor):
     """
@@ -24,7 +25,7 @@ def templates(self):
         }
         """
         dbs = self.dbs_tables_lists()
-
+        logging.debug("cluster: {}, shema: {}".format(self.cluster, self.shema))
         if dbs is None:
             return None
 
@@ -32,11 +33,11 @@ def
templates(self): for db in dbs: templates[db] = {} for table in dbs[db]: - templates[db][table] = self.create_table_description(db=db, table=table) + templates[db][table] = self.create_table_description(cluster=self.cluster, shema=self.shema, db=db, table=table) return templates - def create_table_description(self, db=None, table=None): + def create_table_description(self, cluster=None, shema=None, db=None, table=None): """ High-level function. Produce either text ClickHouse's table SQL CREATE TABLE() template or JSON ClikcHouse's table description @@ -47,13 +48,13 @@ def create_table_description(self, db=None, table=None): """ columns_description = self.create_table_columns_description(db=db, table=table) return { - "create_table_template": self.create_table_sql_template(db=db, table=table, columns_description=columns_description), - "create_table": self.create_table_sql(db=db, table=table, columns_description=columns_description), + "create_table_template": self.create_table_sql_template(cluster=cluster, shema=shema, db=db, table=table, columns_description=columns_description), + "create_table": self.create_table_sql(cluster=cluster, shema=shema, db=db, table=table, columns_description=columns_description), "create_database": self.create_database_sql(db=db), "fields": columns_description, } - def create_table_sql_template(self, db=None, table=None, columns_description=None): + def create_table_sql_template(self, cluster=None, shema=None, db=None, table=None, columns_description=None): """ Produce table template for ClickHouse CREATE TABLE( @@ -80,7 +81,7 @@ def create_table_sql_template(self, db=None, table=None, columns_description=Non ) return sql - def create_table_sql(self, db=None, table=None, columns_description=None): + def create_table_sql(self, cluster=None, shema=None, db=None, table=None, columns_description=None): """ Produce table template for ClickHouse CREATE TABLE( @@ -119,10 +120,11 @@ def create_table_sql(self, db=None, table=None, columns_description=None): {} ) ENGINE = MergeTree({}, ({}), 8192) """.format( - self.create_full_table_name(db=db, table=table), + self.create_full_table_name(shema=shema, db=db, table=table), ",\n ".join(ch_columns), primary_date_field, ",".join(primary_key_fields), + ) return sql From bd15f432bc3e94993da70b7208477b1aa47c2603 Mon Sep 17 00:00:00 2001 From: Wing-Lo Date: Sat, 23 Mar 2019 14:33:47 +0800 Subject: [PATCH 04/67] =?UTF-8?q?cluster=E6=B5=8B=E8=AF=95=E8=BE=93?= =?UTF-8?q?=E5=87=BA=E8=AF=AD=E5=8F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clickhouse_mysql/tablesqlbuilder.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/clickhouse_mysql/tablesqlbuilder.py b/clickhouse_mysql/tablesqlbuilder.py index f5e2b93..87805d4 100644 --- a/clickhouse_mysql/tablesqlbuilder.py +++ b/clickhouse_mysql/tablesqlbuilder.py @@ -74,10 +74,12 @@ def create_table_sql_template(self, cluster=None, shema=None, db=None, table=Non sql = """CREATE TABLE IF NOT EXISTS {} ( {} -) ENGINE = MergeTree(, (), 8192) +) + {} +ENGINE = MergeTree(, (), 8192) """.format( - self.create_full_table_name(db=db, table=table), - ",\n ".join(ch_columns) + self.create_full_table_name(shema=shema, db=db, table=table), + ",\n ".join(ch_columns), "on cluster {}".format(cluster) if cluster != None else "" ) return sql @@ -118,10 +120,13 @@ def create_table_sql(self, cluster=None, shema=None, db=None, table=None, column sql = """CREATE TABLE IF NOT EXISTS {} ( {} -) ENGINE = MergeTree({}, ({}), 
8192) +) + {} +ENGINE = MergeTree({}, ({}), 8192) """.format( self.create_full_table_name(shema=shema, db=db, table=table), ",\n ".join(ch_columns), + "on cluster {}".format(cluster) if cluster != None else "", primary_date_field, ",".join(primary_key_fields), From 99c5966f219f624628923a8f30adb364f8fd2561 Mon Sep 17 00:00:00 2001 From: Wing-Lo Date: Sat, 23 Mar 2019 18:10:24 +0800 Subject: [PATCH 05/67] =?UTF-8?q?=E5=A2=9E=E5=8A=A0cluster=E5=8F=82?= =?UTF-8?q?=E6=95=B0=E6=88=90=E5=8A=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clickhouse_mysql/clioptions.py | 1 + clickhouse_mysql/tablesqlbuilder.py | 11 +++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/clickhouse_mysql/clioptions.py b/clickhouse_mysql/clioptions.py index 77e9bf4..176d3c5 100644 --- a/clickhouse_mysql/clioptions.py +++ b/clickhouse_mysql/clioptions.py @@ -525,6 +525,7 @@ def options(self): 'dst_user': args.dst_user, 'dst_password': args.dst_password, 'dst_schema': args.dst_schema, + 'dst_cluster': args.dst_cluster, 'dst_table': args.dst_table, 'dst_create_table': args.dst_create_table, diff --git a/clickhouse_mysql/tablesqlbuilder.py b/clickhouse_mysql/tablesqlbuilder.py index 87805d4..bda09e6 100644 --- a/clickhouse_mysql/tablesqlbuilder.py +++ b/clickhouse_mysql/tablesqlbuilder.py @@ -72,14 +72,14 @@ def create_table_sql_template(self, cluster=None, shema=None, db=None, table=Non for column_description in columns_description: ch_columns.append('`{}` {}'.format(column_description['field'], column_description['clickhouse_type_nullable'])) - sql = """CREATE TABLE IF NOT EXISTS {} ( + sql = """CREATE TABLE IF NOT EXISTS {} {} ( {} ) - {} ENGINE = MergeTree(, (), 8192) """.format( self.create_full_table_name(shema=shema, db=db, table=table), - ",\n ".join(ch_columns), "on cluster {}".format(cluster) if cluster != None else "" + "on cluster {}".format(cluster) if cluster != None else "", + ",\n ".join(ch_columns), ) return sql @@ -118,15 +118,14 @@ def create_table_sql(self, cluster=None, shema=None, db=None, table=None, column ch_type = column_description['clickhouse_type'] if (field == primary_date_field) or (field in primary_key_fields) else column_description['clickhouse_type_nullable'] ch_columns.append('`{}` {}'.format(field, ch_type)) - sql = """CREATE TABLE IF NOT EXISTS {} ( + sql = """CREATE TABLE IF NOT EXISTS {} {} ( {} ) - {} ENGINE = MergeTree({}, ({}), 8192) """.format( self.create_full_table_name(shema=shema, db=db, table=table), - ",\n ".join(ch_columns), "on cluster {}".format(cluster) if cluster != None else "", + ",\n ".join(ch_columns), primary_date_field, ",".join(primary_key_fields), From d57a803a2d16113d5f5d0ed3ef31bee350fe83a8 Mon Sep 17 00:00:00 2001 From: Wing-Lo Date: Mon, 25 Mar 2019 17:46:03 +0800 Subject: [PATCH 06/67] =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=9C=A8=E5=88=9B?= =?UTF-8?q?=E5=BB=BAsql=E3=80=81=E5=88=9B=E5=BB=BA=E8=A1=A8=E6=97=B6?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=88=B0=E6=8C=87=E5=AE=9A=E7=9A=84dst-clust?= =?UTF-8?q?er?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clickhouse_mysql/config.py | 5 ++++- clickhouse_mysql/dbclient/mysqlclient.py | 1 + clickhouse_mysql/tablemigrator.py | 13 +++++++------ clickhouse_mysql/tableprocessor.py | 10 +++++----- clickhouse_mysql/tablesqlbuilder.py | 18 +++++++++--------- clickhouse_mysql/writer/chwriter.py | 3 +-- 6 files changed, 27 insertions(+), 23 deletions(-) diff --git a/clickhouse_mysql/config.py 
b/clickhouse_mysql/config.py index a3f5f7e..317fc4a 100644 --- a/clickhouse_mysql/config.py +++ b/clickhouse_mysql/config.py @@ -146,6 +146,7 @@ def __init__(self): 'password': self.options['dst_password'], }, 'dst_schema': self.options['dst_schema'], + 'dst_cluster': self.options['dst_cluster'], 'dst_table': self.options['dst_table'], 'dst_create_table': self.options.get_bool('dst_create_table'), }, @@ -249,7 +250,7 @@ def table_sql_builder(self): user=self.config['table_builder']['mysql']['user'], password=self.config['table_builder']['mysql']['password'], dbs=self.config['table_builder']['mysql']['dbs'], - shema=self.config['table_builder']['clickhouse']['dst_schema'], + schema=self.config['table_builder']['clickhouse']['dst_schema'], cluster=self.config['table_builder']['clickhouse']['dst_cluster'], tables=self.config['table_builder']['mysql']['tables'], tables_prefixes=self.config['table_builder']['mysql']['tables_prefixes'], @@ -271,6 +272,8 @@ def table_migrator(self): user=self.config['table_migrator']['mysql']['user'], password=self.config['table_migrator']['mysql']['password'], dbs=self.config['table_migrator']['mysql']['dbs'], + schema=self.config['table_migrator']['clickhouse']['dst_schema'], + cluster=self.config['table_migrator']['clickhouse']['dst_cluster'], tables=self.config['table_migrator']['mysql']['tables'], tables_prefixes=self.config['table_migrator']['mysql']['tables_prefixes'], tables_where_clauses=self.config['table_migrator']['mysql']['tables_where_clauses'], diff --git a/clickhouse_mysql/dbclient/mysqlclient.py b/clickhouse_mysql/dbclient/mysqlclient.py index 6f8bee1..0b0923d 100644 --- a/clickhouse_mysql/dbclient/mysqlclient.py +++ b/clickhouse_mysql/dbclient/mysqlclient.py @@ -62,6 +62,7 @@ def connect(self, db): passwd=self.password, db=db, cursorclass=self.cursorclass, + charset='utf8', ) self.cursor = self.connection.cursor() logging.debug("Connect to the database host={} user={} password={} db={}".format( diff --git a/clickhouse_mysql/tablemigrator.py b/clickhouse_mysql/tablemigrator.py index b631a0f..3ba176c 100644 --- a/clickhouse_mysql/tablemigrator.py +++ b/clickhouse_mysql/tablemigrator.py @@ -37,6 +37,8 @@ def __init__( user=None, password=None, dbs=None, + schema=None, + cluster=None, tables=None, tables_prefixes=None, tables_where_clauses=None, @@ -47,6 +49,8 @@ def __init__( user=user, password=password, dbs=dbs, + schema=schema, + cluster=cluster, tables=tables, tables_prefixes=tables_prefixes, ) @@ -195,14 +199,11 @@ def migrate_one_table_data(self, db=None, table=None): self.chwriter.flush() cnt += len(rows) - except: - raise Exception("Can not migrate table on host={} user={} password={} db={} table={} cnt={}".format( - self.host, - self.user, - self.password, + except Exception as ex: + logging.critical("Critical error: {}".format(str(ex))) + raise Exception("Can not migrate table on db={} table={}".format( db, table, - cnt )) return cnt diff --git a/clickhouse_mysql/tableprocessor.py b/clickhouse_mysql/tableprocessor.py index 0d54a3e..ee2d241 100644 --- a/clickhouse_mysql/tableprocessor.py +++ b/clickhouse_mysql/tableprocessor.py @@ -25,7 +25,7 @@ def __init__( user=None, password=None, dbs=None, - shema=None, + schema=None, cluster=None, tables=None, tables_prefixes=None, @@ -48,7 +48,7 @@ def __init__( 'user': user, 'password': password, }) - self.shema = shema + self.schema = schema self.cluster = cluster def dbs_tables_lists(self): @@ -135,7 +135,7 @@ def tables_match(self, db, prefix): return res @staticmethod - def 
create_full_table_name(shema=None, db=None, table=None): + def create_full_table_name(schema=None, db=None, table=None): """ Create fully-specified table name as `db`.`table` or just `table` @@ -143,8 +143,8 @@ def create_full_table_name(shema=None, db=None, table=None): :param table: :return: `db`.`table` or just `table` """ - if shema != None: - return '`{0}`.`{1}`'.format(shema, db+"__"+table) if db else '`{0}`'.format(table) + if schema != None: + return '`{0}`.`{1}`'.format(schema, db+"__"+table) if db else '`{0}`'.format(table) else: return '`{0}`.`{1}`'.format(db, table) if db else '`{0}`'.format(table) diff --git a/clickhouse_mysql/tablesqlbuilder.py b/clickhouse_mysql/tablesqlbuilder.py index bda09e6..59bb546 100644 --- a/clickhouse_mysql/tablesqlbuilder.py +++ b/clickhouse_mysql/tablesqlbuilder.py @@ -25,7 +25,7 @@ def templates(self): } """ dbs = self.dbs_tables_lists() - logging.debug("cluster: {}, shema: {}".format(self.cluster, self.shema)) + logging.debug("cluster: {}, schema: {}".format(self.cluster, self.schema)) if dbs is None: return None @@ -33,11 +33,11 @@ def templates(self): for db in dbs: templates[db] = {} for table in dbs[db]: - templates[db][table] = self.create_table_description(cluster=self.cluster, shema=self.shema, db=db, table=table) + templates[db][table] = self.create_table_description(cluster=self.cluster, schema=self.schema, db=db, table=table) return templates - def create_table_description(self, cluster=None, shema=None, db=None, table=None): + def create_table_description(self, cluster=None, schema=None, db=None, table=None): """ High-level function. Produce either text ClickHouse's table SQL CREATE TABLE() template or JSON ClikcHouse's table description @@ -48,13 +48,13 @@ def create_table_description(self, cluster=None, shema=None, db=None, table=None """ columns_description = self.create_table_columns_description(db=db, table=table) return { - "create_table_template": self.create_table_sql_template(cluster=cluster, shema=shema, db=db, table=table, columns_description=columns_description), - "create_table": self.create_table_sql(cluster=cluster, shema=shema, db=db, table=table, columns_description=columns_description), + "create_table_template": self.create_table_sql_template(cluster=cluster, schema=schema, db=db, table=table, columns_description=columns_description), + "create_table": self.create_table_sql(cluster=cluster, schema=schema, db=db, table=table, columns_description=columns_description), "create_database": self.create_database_sql(db=db), "fields": columns_description, } - def create_table_sql_template(self, cluster=None, shema=None, db=None, table=None, columns_description=None): + def create_table_sql_template(self, cluster=None, schema=None, db=None, table=None, columns_description=None): """ Produce table template for ClickHouse CREATE TABLE( @@ -77,13 +77,13 @@ def create_table_sql_template(self, cluster=None, shema=None, db=None, table=Non ) ENGINE = MergeTree(, (), 8192) """.format( - self.create_full_table_name(shema=shema, db=db, table=table), + self.create_full_table_name(schema=schema, db=db, table=table), "on cluster {}".format(cluster) if cluster != None else "", ",\n ".join(ch_columns), ) return sql - def create_table_sql(self, cluster=None, shema=None, db=None, table=None, columns_description=None): + def create_table_sql(self, cluster=None, schema=None, db=None, table=None, columns_description=None): """ Produce table template for ClickHouse CREATE TABLE( @@ -123,7 +123,7 @@ def create_table_sql(self, cluster=None, 
shema=None, db=None, table=None, column ) ENGINE = MergeTree({}, ({}), 8192) """.format( - self.create_full_table_name(shema=shema, db=db, table=table), + self.create_full_table_name(schema=schema, db=db, table=table), "on cluster {}".format(cluster) if cluster != None else "", ",\n ".join(ch_columns), primary_date_field, diff --git a/clickhouse_mysql/writer/chwriter.py b/clickhouse_mysql/writer/chwriter.py index 96d8030..28abe63 100644 --- a/clickhouse_mysql/writer/chwriter.py +++ b/clickhouse_mysql/writer/chwriter.py @@ -67,7 +67,7 @@ def insert(self, event_or_events=None): # determine target schema.table schema = self.dst_schema if self.dst_schema else event_converted.schema - table = self.dst_table if self.dst_table else event_converted.table + table = self.dst_table if self.dst_table else event_converted.schema+"__"+event_converted.table logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format(schema, table, self.dst_schema, self.dst_table)) # and INSERT converted rows @@ -84,7 +84,6 @@ def insert(self, event_or_events=None): logging.critical('QUERY FAILED') logging.critical('ex={}'.format(ex)) logging.critical('sql={}'.format(sql)) - logging.critical('rows={}'.format(rows)) sys.exit(0) # all DONE From 46bceae53b52b435df1324e71d02f7760bc4bf6b Mon Sep 17 00:00:00 2001 From: Wing-Lo Date: Tue, 26 Mar 2019 11:36:22 +0800 Subject: [PATCH 07/67] =?UTF-8?q?=E4=BC=A0=E5=85=A5dst-distribute=E5=8F=82?= =?UTF-8?q?=E6=95=B0=E6=97=B6=EF=BC=8C=E4=BC=9A=E6=8A=8Aschema=E5=90=8E?= =?UTF-8?q?=E5=8A=A0=5Fall=EF=BC=8Ctable=E5=90=8E=E5=8A=A0=5Fall?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clickhouse_mysql/clioptions.py | 8 ++++++++ clickhouse_mysql/config.py | 4 ++++ clickhouse_mysql/tablemigrator.py | 2 ++ clickhouse_mysql/tableprocessor.py | 11 ++++++++--- clickhouse_mysql/tablesqlbuilder.py | 21 +++++++++++++++++---- 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/clickhouse_mysql/clioptions.py b/clickhouse_mysql/clioptions.py index 176d3c5..7719a31 100644 --- a/clickhouse_mysql/clioptions.py +++ b/clickhouse_mysql/clioptions.py @@ -142,6 +142,7 @@ class CLIOptions(Options): 'dst_user': 'default', 'dst_password': '', 'dst_schema': None, + 'dst_distribute': False, 'dst_cluster': None, 'dst_table': None, 'dst_create_table': False, @@ -419,6 +420,12 @@ def options(self): default=self.default_options['dst_schema'], help='Database/schema to be used when writing to dst. 
Ex.: db1' ) + argparser.add_argument( + '--dst-distribute', + action='store_true', + default=self.default_options['dst_distribute'], + help='is to add distribute table' + ) argparser.add_argument( '--dst-cluster', type=str, @@ -525,6 +532,7 @@ def options(self): 'dst_user': args.dst_user, 'dst_password': args.dst_password, 'dst_schema': args.dst_schema, + 'dst_distribute': args.dst_distribute, 'dst_cluster': args.dst_cluster, 'dst_table': args.dst_table, 'dst_create_table': args.dst_create_table, diff --git a/clickhouse_mysql/config.py b/clickhouse_mysql/config.py index 317fc4a..3775348 100644 --- a/clickhouse_mysql/config.py +++ b/clickhouse_mysql/config.py @@ -119,6 +119,7 @@ def __init__(self): 'password': self.options['dst_password'], }, 'dst_schema': self.options['dst_schema'], + 'dst_distribute': self.options['dst_distribute'], 'dst_cluster': self.options['dst_cluster'], 'dst_create_table': self.options.get_bool('dst_create_table'), }, @@ -146,6 +147,7 @@ def __init__(self): 'password': self.options['dst_password'], }, 'dst_schema': self.options['dst_schema'], + 'dst_distribute': self.options['dst_distribute'], 'dst_cluster': self.options['dst_cluster'], 'dst_table': self.options['dst_table'], 'dst_create_table': self.options.get_bool('dst_create_table'), @@ -251,6 +253,7 @@ def table_sql_builder(self): password=self.config['table_builder']['mysql']['password'], dbs=self.config['table_builder']['mysql']['dbs'], schema=self.config['table_builder']['clickhouse']['dst_schema'], + distribute=self.config['table_builder']['clickhouse']['dst_distribute'], cluster=self.config['table_builder']['clickhouse']['dst_cluster'], tables=self.config['table_builder']['mysql']['tables'], tables_prefixes=self.config['table_builder']['mysql']['tables_prefixes'], @@ -273,6 +276,7 @@ def table_migrator(self): password=self.config['table_migrator']['mysql']['password'], dbs=self.config['table_migrator']['mysql']['dbs'], schema=self.config['table_migrator']['clickhouse']['dst_schema'], + distribute=self.config['table_migrator']['clickhouse']['dst_distribute'], cluster=self.config['table_migrator']['clickhouse']['dst_cluster'], tables=self.config['table_migrator']['mysql']['tables'], tables_prefixes=self.config['table_migrator']['mysql']['tables_prefixes'], diff --git a/clickhouse_mysql/tablemigrator.py b/clickhouse_mysql/tablemigrator.py index 3ba176c..6c7165d 100644 --- a/clickhouse_mysql/tablemigrator.py +++ b/clickhouse_mysql/tablemigrator.py @@ -38,6 +38,7 @@ def __init__( password=None, dbs=None, schema=None, + distribute=None, cluster=None, tables=None, tables_prefixes=None, @@ -50,6 +51,7 @@ def __init__( password=password, dbs=dbs, schema=schema, + distribute=distribute, cluster=cluster, tables=tables, tables_prefixes=tables_prefixes, diff --git a/clickhouse_mysql/tableprocessor.py b/clickhouse_mysql/tableprocessor.py index ee2d241..e987b2c 100644 --- a/clickhouse_mysql/tableprocessor.py +++ b/clickhouse_mysql/tableprocessor.py @@ -26,6 +26,7 @@ def __init__( password=None, dbs=None, schema=None, + distribute=None, cluster=None, tables=None, tables_prefixes=None, @@ -50,6 +51,7 @@ def __init__( }) self.schema = schema self.cluster = cluster + self.distribute = distribute def dbs_tables_lists(self): """ @@ -135,15 +137,18 @@ def tables_match(self, db, prefix): return res @staticmethod - def create_full_table_name(schema=None, db=None, table=None): + def create_full_table_name(schema=None, db=None, table=None, distribute=None): """ - Create fully-specified table name as `db`.`table` or just `table` + 
Create fully-specified table name as `schema_all`.`db__table_all` or `schema`.`db__table` or just `db`.`table` :param db: :param table: - :return: `db`.`table` or just `table` + :return: `schema_all`.`db__table_all` or `schema`.`db__table` or just `db`.`table` """ if schema != None: + if distribute: + schema += "_all" + table += "_all" return '`{0}`.`{1}`'.format(schema, db+"__"+table) if db else '`{0}`'.format(table) else: return '`{0}`.`{1}`'.format(db, table) if db else '`{0}`'.format(table) diff --git a/clickhouse_mysql/tablesqlbuilder.py b/clickhouse_mysql/tablesqlbuilder.py index 59bb546..742c9ef 100644 --- a/clickhouse_mysql/tablesqlbuilder.py +++ b/clickhouse_mysql/tablesqlbuilder.py @@ -121,13 +121,12 @@ def create_table_sql(self, cluster=None, schema=None, db=None, table=None, colum sql = """CREATE TABLE IF NOT EXISTS {} {} ( {} ) -ENGINE = MergeTree({}, ({}), 8192) +{} """.format( - self.create_full_table_name(schema=schema, db=db, table=table), + self.create_full_table_name(schema=schema, db=db, table=table, distribute=self.distribute), "on cluster {}".format(cluster) if cluster != None else "", ",\n ".join(ch_columns), - primary_date_field, - ",".join(primary_key_fields), + self.create_table_engine(self.cluster, self.schema, db+"__"+table, primary_date_field+"_all", ",".join(primary_key_fields), self.distribute), ) return sql @@ -335,6 +334,20 @@ def map_type_nullable(self, mysql_type, nullable=False): return ch_type + def create_table_engine(self, cluster=None, dst_schema=None, dst_table=None,primary_date_field=None, primary_key_fields=None, distribute=None): + logging.debug("cluster={}, dst_schema={}, dst_table={},primary_date_field={}, primary_key_fields={}, distribute={}" + .format(cluster, dst_schema, dst_table,primary_date_field, primary_key_fields, distribute)) + if distribute : + return "ENGINE = Distributed({}, '{}', '{}', rand())".format( + cluster, + dst_schema, + dst_table + ) + else: + return "ENGINE = MergeTree({}, ({}), 8192)".format( + primary_date_field, + primary_key_fields) + if __name__ == '__main__': tb = TableSQLBuilder( host='127.0.0.1', From 8a97a1e87feea0e48ec43baffe03ae23257a04d2 Mon Sep 17 00:00:00 2001 From: Wing-Lo Date: Tue, 26 Mar 2019 11:42:17 +0800 Subject: [PATCH 08/67] update CHANGELOG.md --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5425f30..b4d6bea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +# clickhouse-mysql 2019-03-25 + +## new features +* added new CLI option `--dst-schema` - make table full name change to `schema`.`db__table` +* added new CLI option `--dst-cluster` - support table create on cluster +* added new CLI option `--dst-distribute` - make table full name change to `schema_all`.`db__table_all`,and engine change to Distributed + # clickhouse-mysql 2018-03-14 ## new features From a2067cefe99080e16f2e97758a545fd19d231faa Mon Sep 17 00:00:00 2001 From: Wing-Lo Date: Wed, 3 Apr 2019 18:11:12 +0800 Subject: [PATCH 09/67] =?UTF-8?q?=E4=BC=A0dst-distribute=E6=97=B6=E4=B8=8D?= =?UTF-8?q?=E6=96=B0=E5=A2=9Eon=20cluster=E8=AF=AD=E5=8F=A5=EF=BC=8C?= =?UTF-8?q?=E4=BC=A0dst-distribute=E6=94=AF=E6=8C=81=E6=89=B9=E9=87=8F?= =?UTF-8?q?=E5=86=99=E5=85=A5=E8=A1=A8=E6=95=B0=E6=8D=AE=EF=BC=8C=E5=90=8C?= =?UTF-8?q?=E6=97=B6=E4=BF=AE=E5=A4=8DDecimal=E4=B8=8D=E8=83=BD=E8=BD=AC?= =?UTF-8?q?=E6=8D=A2=E4=B8=BAString=E7=9A=84=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clickhouse_mysql/config.py | 2 ++ 
clickhouse_mysql/tablesqlbuilder.py | 4 ++-- clickhouse_mysql/writer/chwriter.py | 24 ++++++++++++++++++++++-- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/clickhouse_mysql/config.py b/clickhouse_mysql/config.py index 3775348..51dda1a 100644 --- a/clickhouse_mysql/config.py +++ b/clickhouse_mysql/config.py @@ -194,6 +194,7 @@ def __init__(self): }, 'dst_schema': self.options['dst_schema'], 'dst_table': self.options['dst_table'], + 'dst_distribute': self.options['dst_distribute'], }, 'file': { 'csv_file_path': self.options['dst_file'], @@ -372,6 +373,7 @@ def writer_builder_chwriter(self): }, 'dst_schema': self.config['writer']['clickhouse']['dst_schema'], 'dst_table': self.config['writer']['clickhouse']['dst_table'], + 'dst_distribute': self.config['writer']['clickhouse']['dst_distribute'], 'next_writer_builder': None, 'converter_builder': self.converter_builder(CONVERTER_CH), }) diff --git a/clickhouse_mysql/tablesqlbuilder.py b/clickhouse_mysql/tablesqlbuilder.py index 742c9ef..49927a4 100644 --- a/clickhouse_mysql/tablesqlbuilder.py +++ b/clickhouse_mysql/tablesqlbuilder.py @@ -124,9 +124,9 @@ def create_table_sql(self, cluster=None, schema=None, db=None, table=None, colum {} """.format( self.create_full_table_name(schema=schema, db=db, table=table, distribute=self.distribute), - "on cluster {}".format(cluster) if cluster != None else "", + "on cluster {}".format(cluster) if not self.distribute and cluster != None else "", ",\n ".join(ch_columns), - self.create_table_engine(self.cluster, self.schema, db+"__"+table, primary_date_field+"_all", ",".join(primary_key_fields), self.distribute), + self.create_table_engine(self.cluster, self.schema, db+"__"+table, primary_date_field, ",".join(primary_key_fields), self.distribute), ) return sql diff --git a/clickhouse_mysql/writer/chwriter.py b/clickhouse_mysql/writer/chwriter.py index 28abe63..5638795 100644 --- a/clickhouse_mysql/writer/chwriter.py +++ b/clickhouse_mysql/writer/chwriter.py @@ -4,6 +4,8 @@ import logging import sys +from decimal import Decimal + from clickhouse_mysql.dbclient.chclient import CHClient from clickhouse_mysql.writer.writer import Writer @@ -16,19 +18,26 @@ class CHWriter(Writer): client = None dst_schema = None dst_table = None + dst_distribute = None def __init__( self, connection_settings, dst_schema=None, dst_table=None, + dst_distribute=False, next_writer_builder=None, converter_builder=None, ): - logging.info("CHWriter() connection_settings={} dst_schema={} dst_table={}".format(connection_settings, dst_schema, dst_table)) + if dst_distribute and dst_schema is not None: + dst_schema += "_all" + if dst_distribute and dst_table is not None: + dst_table += "_all" + logging.info("CHWriter() connection_settings={} dst_schema={} dst_table={} dst_distribute={}".format(connection_settings, dst_schema, dst_table, dst_distribute)) self.client = CHClient(connection_settings) self.dst_schema = dst_schema self.dst_table = dst_table + self.dst_distribute = dst_distribute def insert(self, event_or_events=None): # event_or_events = [ @@ -60,6 +69,10 @@ def insert(self, event_or_events=None): event_converted = self.convert(event) for row in event_converted: + for key in row.keys(): + # we need to convert Decimal value to str value for suitable for table structure + if (type(row[key]) == Decimal): + row[key] = str(row[key]) rows.append(row) logging.debug('class:%s insert %d row(s)', __class__, len(rows)) @@ -67,7 +80,14 @@ def insert(self, event_or_events=None): # determine target schema.table schema = 
self.dst_schema if self.dst_schema else event_converted.schema - table = self.dst_table if self.dst_table else event_converted.schema+"__"+event_converted.table + table = None + if self.dst_table: + table = self.dst_table + elif self.dst_distribute: + # if current is going to insert distributed table,we need '_all' suffix + table = event_converted.schema + "__" + event_converted.table + "_all" + else: + table = event_converted.schema + "__" + event_converted.table logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format(schema, table, self.dst_schema, self.dst_table)) # and INSERT converted rows From f321c3d8c78e277b0f42ab08922c2ba41e49e375 Mon Sep 17 00:00:00 2001 From: Wing-Lo Date: Fri, 12 Apr 2019 10:50:27 +0800 Subject: [PATCH 10/67] =?UTF-8?q?=E4=BC=A0column=5Fskip=E6=97=B6=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E5=88=9B=E5=BB=BA=E8=A1=A8=E3=80=81=E5=AF=BC=E5=85=A5?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E6=97=B6=EF=BC=8C=E8=B7=B3=E8=BF=87=E6=8C=87?= =?UTF-8?q?=E5=AE=9A=E5=88=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clickhouse_mysql/config.py | 6 ++++++ clickhouse_mysql/tablemigrator.py | 26 ++++++++++++++++++++++---- clickhouse_mysql/tableprocessor.py | 3 +++ clickhouse_mysql/tablesqlbuilder.py | 4 ++++ 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/clickhouse_mysql/config.py b/clickhouse_mysql/config.py index 51dda1a..658925a 100644 --- a/clickhouse_mysql/config.py +++ b/clickhouse_mysql/config.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import logging from clickhouse_mysql.reader.mysqlreader import MySQLReader from clickhouse_mysql.reader.csvreader import CSVReader @@ -110,6 +111,7 @@ def __init__(self): 'dbs': self.options.get_list('src_schemas'), 'tables': self.options.get_list('src_tables'), 'tables_prefixes': self.options.get_list('src_tables_prefixes'), + 'column_skip': self.options['column_skip'] }, 'clickhouse': { 'connection_settings': { @@ -138,6 +140,7 @@ def __init__(self): 'tables': self.options.get_list('src_tables'), 'tables_prefixes': self.options.get_list('src_tables_prefixes'), 'tables_where_clauses': self.options.get_list('src_tables_where_clauses'), + 'column_skip': self.options['column_skip'] }, 'clickhouse': { 'connection_settings': { @@ -247,6 +250,7 @@ def is_install(self): return self.config['app']['install'] def table_sql_builder(self): + logging.debug("----config column_skip: %s",self.config['converter']['clickhouse']['column_skip']) return TableSQLBuilder( host=self.config['table_builder']['mysql']['host'], port=self.config['table_builder']['mysql']['port'], @@ -258,6 +262,7 @@ def table_sql_builder(self): cluster=self.config['table_builder']['clickhouse']['dst_cluster'], tables=self.config['table_builder']['mysql']['tables'], tables_prefixes=self.config['table_builder']['mysql']['tables_prefixes'], + column_skip=self.config['converter']['clickhouse']['column_skip'], ) def is_migrate_table(self): @@ -282,6 +287,7 @@ def table_migrator(self): tables=self.config['table_migrator']['mysql']['tables'], tables_prefixes=self.config['table_migrator']['mysql']['tables_prefixes'], tables_where_clauses=self.config['table_migrator']['mysql']['tables_where_clauses'], + column_skip=self.config['converter']['clickhouse']['column_skip'], ) table_migrator.chwriter = self.writer_builder_chwriter().get() table_migrator.chclient = self.chclient() diff --git a/clickhouse_mysql/tablemigrator.py b/clickhouse_mysql/tablemigrator.py index 6c7165d..cb6dcfb 100644 --- 
a/clickhouse_mysql/tablemigrator.py +++ b/clickhouse_mysql/tablemigrator.py @@ -3,7 +3,7 @@ import logging -from MySQLdb.cursors import SSDictCursor +from MySQLdb.cursors import SSDictCursor,Cursor from clickhouse_mysql.tableprocessor import TableProcessor from clickhouse_mysql.tablesqlbuilder import TableSQLBuilder from clickhouse_mysql.event.event import Event @@ -43,6 +43,7 @@ def __init__( tables=None, tables_prefixes=None, tables_where_clauses=None, + column_skip=[], ): super().__init__( host=host, @@ -55,6 +56,7 @@ def __init__( cluster=cluster, tables=tables, tables_prefixes=tables_prefixes, + column_skip=column_skip ) self.client.cursorclass = SSDictCursor @@ -69,6 +71,7 @@ def __init__( # ] # debug info + logging.debug("column_skip={}".format(column_skip)) logging.info("tables_where_clauses={}".format(tables_where_clauses)) for table_where in tables_where_clauses: logging.info("table_where={}".format(table_where)) @@ -173,17 +176,20 @@ def migrate_one_table_data(self, db=None, table=None): :return: number of migrated rows """ - self.client.cursorclass = SSDictCursor - self.client.connect(db=db) # build SQL statement - sql = "SELECT * FROM {0}".format(self.create_full_table_name(db=db, table=table)) + full_table_name = self.create_full_table_name(db=db, table=table) + sql = "SELECT {0} FROM {1}".format( + ",".join(self.get_columns(db,full_table_name)) + ,full_table_name) # in case we have WHERE clause for this db.table - add it to SQL if db in self.where_clauses and table in self.where_clauses[db]: sql += " WHERE {}".format(self.where_clauses[db][table]) try: logging.info("migrate_table. sql={}".format(sql)) + self.client.cursorclass = SSDictCursor + self.client.connect(db=db) self.client.cursor.execute(sql) cnt = 0; while True: @@ -210,6 +216,18 @@ def migrate_one_table_data(self, db=None, table=None): return cnt + def get_columns(self,db,full_table_name): + self.client.cursorclass = Cursor + self.client.connect(db=db) + self.client.cursor.execute("DESC {}".format(full_table_name)) + fields = [] + for (_field, _type, _null, _key, _default, _extra,) in self.client.cursor: + logging.debug("遍历表结构%s:%s,%s,%s,%s,%s,%s",full_table_name,_field, _type, _null, _key, _default, _extra) + if self.column_skip.__contains__(_field): + logging.debug("跳过%s",_field) + continue + fields.append(_field) + return fields if __name__ == '__main__': tb = TableBuilder( diff --git a/clickhouse_mysql/tableprocessor.py b/clickhouse_mysql/tableprocessor.py index e987b2c..91a097f 100644 --- a/clickhouse_mysql/tableprocessor.py +++ b/clickhouse_mysql/tableprocessor.py @@ -30,6 +30,7 @@ def __init__( cluster=None, tables=None, tables_prefixes=None, + column_skip=None, ): """ :param host: string MySQL host @@ -52,6 +53,8 @@ def __init__( self.schema = schema self.cluster = cluster self.distribute = distribute + logging.debug("------column_skip: %s", column_skip) + self.column_skip = column_skip def dbs_tables_lists(self): """ diff --git a/clickhouse_mysql/tablesqlbuilder.py b/clickhouse_mysql/tablesqlbuilder.py index 49927a4..4540488 100644 --- a/clickhouse_mysql/tablesqlbuilder.py +++ b/clickhouse_mysql/tablesqlbuilder.py @@ -25,6 +25,7 @@ def templates(self): } """ dbs = self.dbs_tables_lists() + logging.debug("------column_skip: %s", self.column_skip) logging.debug("cluster: {}, schema: {}".format(self.cluster, self.schema)) if dbs is None: return None @@ -168,6 +169,9 @@ def create_table_columns_description(self, db=None, table=None, ): # build ready-to-sql column specification Ex.: # `integer_1` 
Nullable(Int32) # `u_integer_1` Nullable(UInt32) + if self.column_skip.__contains__(_field): + logging.debug("跳过%s",_field) + continue columns_description.append({ 'field': _field, 'mysql_type': _type, From f69f0384231506e406547e5369ff94086d551c14 Mon Sep 17 00:00:00 2001 From: Wing-Lo Date: Fri, 12 Apr 2019 10:52:22 +0800 Subject: [PATCH 11/67] =?UTF-8?q?=E5=88=A0=E9=99=A4=E4=B8=8D=E5=BF=85?= =?UTF-8?q?=E8=A6=81=E7=9A=84logging?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clickhouse_mysql/config.py | 2 -- clickhouse_mysql/tablemigrator.py | 4 +--- clickhouse_mysql/tableprocessor.py | 1 - clickhouse_mysql/tablesqlbuilder.py | 5 +---- 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/clickhouse_mysql/config.py b/clickhouse_mysql/config.py index 658925a..91d79d8 100644 --- a/clickhouse_mysql/config.py +++ b/clickhouse_mysql/config.py @@ -1,7 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import logging from clickhouse_mysql.reader.mysqlreader import MySQLReader from clickhouse_mysql.reader.csvreader import CSVReader @@ -250,7 +249,6 @@ def is_install(self): return self.config['app']['install'] def table_sql_builder(self): - logging.debug("----config column_skip: %s",self.config['converter']['clickhouse']['column_skip']) return TableSQLBuilder( host=self.config['table_builder']['mysql']['host'], port=self.config['table_builder']['mysql']['port'], diff --git a/clickhouse_mysql/tablemigrator.py b/clickhouse_mysql/tablemigrator.py index cb6dcfb..86cd7a7 100644 --- a/clickhouse_mysql/tablemigrator.py +++ b/clickhouse_mysql/tablemigrator.py @@ -71,7 +71,6 @@ def __init__( # ] # debug info - logging.debug("column_skip={}".format(column_skip)) logging.info("tables_where_clauses={}".format(tables_where_clauses)) for table_where in tables_where_clauses: logging.info("table_where={}".format(table_where)) @@ -222,9 +221,8 @@ def get_columns(self,db,full_table_name): self.client.cursor.execute("DESC {}".format(full_table_name)) fields = [] for (_field, _type, _null, _key, _default, _extra,) in self.client.cursor: - logging.debug("遍历表结构%s:%s,%s,%s,%s,%s,%s",full_table_name,_field, _type, _null, _key, _default, _extra) if self.column_skip.__contains__(_field): - logging.debug("跳过%s",_field) + logging.debug("skip column %s",_field) continue fields.append(_field) return fields diff --git a/clickhouse_mysql/tableprocessor.py b/clickhouse_mysql/tableprocessor.py index 91a097f..6a353fe 100644 --- a/clickhouse_mysql/tableprocessor.py +++ b/clickhouse_mysql/tableprocessor.py @@ -53,7 +53,6 @@ def __init__( self.schema = schema self.cluster = cluster self.distribute = distribute - logging.debug("------column_skip: %s", column_skip) self.column_skip = column_skip def dbs_tables_lists(self): diff --git a/clickhouse_mysql/tablesqlbuilder.py b/clickhouse_mysql/tablesqlbuilder.py index 4540488..974f559 100644 --- a/clickhouse_mysql/tablesqlbuilder.py +++ b/clickhouse_mysql/tablesqlbuilder.py @@ -25,8 +25,6 @@ def templates(self): } """ dbs = self.dbs_tables_lists() - logging.debug("------column_skip: %s", self.column_skip) - logging.debug("cluster: {}, schema: {}".format(self.cluster, self.schema)) if dbs is None: return None @@ -170,7 +168,7 @@ def create_table_columns_description(self, db=None, table=None, ): # `integer_1` Nullable(Int32) # `u_integer_1` Nullable(UInt32) if self.column_skip.__contains__(_field): - logging.debug("跳过%s",_field) + logging.debug("table sql builder skip column %s",_field) continue columns_description.append({ 'field': 
_field, @@ -339,7 +337,6 @@ def map_type_nullable(self, mysql_type, nullable=False): return ch_type def create_table_engine(self, cluster=None, dst_schema=None, dst_table=None,primary_date_field=None, primary_key_fields=None, distribute=None): - logging.debug("cluster={}, dst_schema={}, dst_table={},primary_date_field={}, primary_key_fields={}, distribute={}" .format(cluster, dst_schema, dst_table,primary_date_field, primary_key_fields, distribute)) if distribute : return "ENGINE = Distributed({}, '{}', '{}', rand())".format( From abe969d69829ce00336d3522f38c014f935c3fe9 Mon Sep 17 00:00:00 2001 From: Wing-Lo Date: Tue, 23 Apr 2019 17:41:06 +0800 Subject: [PATCH 12/67] =?UTF-8?q?column=5Fskip=E9=BB=98=E8=AE=A4=E5=80=BC?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=B8=BA=E7=A9=BA=E6=95=B0=E7=BB=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clickhouse_mysql/clioptions.py | 2 +- clickhouse_mysql/tableprocessor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clickhouse_mysql/clioptions.py b/clickhouse_mysql/clioptions.py index 7719a31..6d8085c 100644 --- a/clickhouse_mysql/clioptions.py +++ b/clickhouse_mysql/clioptions.py @@ -151,7 +151,7 @@ class CLIOptions(Options): # converters section # 'column_default_value': None, - 'column_skip': None, + 'column_skip': [], 'ch_converter_file': None, 'ch_converter_class': None, } diff --git a/clickhouse_mysql/tableprocessor.py b/clickhouse_mysql/tableprocessor.py index 6a353fe..486026b 100644 --- a/clickhouse_mysql/tableprocessor.py +++ b/clickhouse_mysql/tableprocessor.py @@ -30,7 +30,7 @@ def __init__( cluster=None, tables=None, tables_prefixes=None, - column_skip=None, + column_skip=[], ): """ :param host: string MySQL host From fb16a2bdf98430ac448e6e3101c06fe458715245 Mon Sep 17 00:00:00 2001 From: Wing-Lo Date: Tue, 30 Apr 2019 18:13:50 +0800 Subject: [PATCH 13/67] =?UTF-8?q?=E5=88=A0=E9=99=A4=E4=B8=8D=E5=BF=85?= =?UTF-8?q?=E8=A6=81=E7=9A=84logging?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clickhouse_mysql/tablesqlbuilder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/clickhouse_mysql/tablesqlbuilder.py b/clickhouse_mysql/tablesqlbuilder.py index 974f559..06454c9 100644 --- a/clickhouse_mysql/tablesqlbuilder.py +++ b/clickhouse_mysql/tablesqlbuilder.py @@ -337,7 +337,6 @@ def map_type_nullable(self, mysql_type, nullable=False): return ch_type def create_table_engine(self, cluster=None, dst_schema=None, dst_table=None,primary_date_field=None, primary_key_fields=None, distribute=None): - .format(cluster, dst_schema, dst_table,primary_date_field, primary_key_fields, distribute)) if distribute : return "ENGINE = Distributed({}, '{}', '{}', rand())".format( cluster, From ec3aa80e242a4f824146b37ad4636ad237303c1f Mon Sep 17 00:00:00 2001 From: Wing-Lo Date: Wed, 15 May 2019 14:21:26 +0800 Subject: [PATCH 14/67] =?UTF-8?q?bug-=E5=AE=9E=E6=97=B6=E5=AF=BC=E5=85=A5?= =?UTF-8?q?=E8=A1=A8=E6=97=B6=E4=BC=9A=E5=87=BA=E7=8E=B0=E7=9A=84=E9=94=99?= =?UTF-8?q?=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- clickhouse_mysql/tablesqlbuilder.py | 19 ++++++++++++------- clickhouse_mysql/writer/chcsvwriter.py | 16 +++++++++++++++- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/clickhouse_mysql/tablesqlbuilder.py b/clickhouse_mysql/tablesqlbuilder.py index 06454c9..4cb90bc 100644 --- a/clickhouse_mysql/tablesqlbuilder.py +++ b/clickhouse_mysql/tablesqlbuilder.py @@ -101,10 
+101,10 @@ def create_table_sql(self, cluster=None, schema=None, db=None, table=None, colum primary_date_field = self.fetch_primary_date_field(columns_description) primary_key_fields = self.fetch_primary_key_fields(columns_description) - if primary_date_field is None: - # No primary date field found. Make one - primary_date_field = 'primary_date_field' - ch_columns.append('`primary_date_field` Date default today()') + # if primary_date_field is None: + # # No primary date field found. Make one + # primary_date_field = 'primary_date_field' + # ch_columns.append('`primary_date_field` Date default today()') if primary_key_fields is None: # No primary key fields found. Make PK from primary date field @@ -192,6 +192,8 @@ def fetch_primary_date_field(self, columns_description): for column_description in columns_description: if (column_description['clickhouse_type'] == 'Date'): return column_description['field'] + if (column_description['clickhouse_type'] == 'DateTime'): + return column_description['field'] return None @@ -344,9 +346,12 @@ def create_table_engine(self, cluster=None, dst_schema=None, dst_table=None,prim dst_table ) else: - return "ENGINE = MergeTree({}, ({}), 8192)".format( - primary_date_field, - primary_key_fields) + engine = "ENGINE = ReplacingMergeTree() " + if primary_date_field is not None: + engine += "PARTITION BY toYYYYMM({}) ".format(primary_date_field) + if primary_key_fields is not None: + engine += "ORDER BY ({})".format(primary_key_fields) + return engine if __name__ == '__main__': tb = TableSQLBuilder( diff --git a/clickhouse_mysql/writer/chcsvwriter.py b/clickhouse_mysql/writer/chcsvwriter.py index 605544e..8b8ff00 100644 --- a/clickhouse_mysql/writer/chcsvwriter.py +++ b/clickhouse_mysql/writer/chcsvwriter.py @@ -13,6 +13,7 @@ class CHCSVWriter(Writer): dst_schema = None dst_table = None + dst_distribute = None host = None port = None @@ -24,7 +25,12 @@ def __init__( connection_settings, dst_schema=None, dst_table=None, + dst_distribute=False, ): + if dst_distribute and dst_schema is not None: + dst_schema += "_all" + if dst_distribute and dst_table is not None: + dst_table += "_all" logging.info("CHCSWriter() connection_settings={} dst_schema={} dst_table={}".format(connection_settings, dst_schema, dst_table)) self.host = connection_settings['host'] self.port = connection_settings['port'] @@ -32,6 +38,7 @@ def __init__( self.password = connection_settings['password'] self.dst_schema = dst_schema self.dst_table = dst_table + self.dst_distribute = dst_distribute def insert(self, event_or_events=None): # event_or_events = [ @@ -54,7 +61,14 @@ def insert(self, event_or_events=None): for event in events: schema = self.dst_schema if self.dst_schema else event.schema - table = self.dst_table if self.dst_table else event.table + table = None + if self.dst_table: + table = self.dst_table + elif self.dst_distribute: + # if current is going to insert distributed table,we need '_all' suffix + table = event.schema + "__" + event.table + "_all" + else: + table = event.schema + "__" + event.table sql = 'INSERT INTO `{0}`.`{1}` ({2}) FORMAT CSV'.format( schema, From 0f4b91a851c2611ad5f9c0839f5dc3cd397cda38 Mon Sep 17 00:00:00 2001 From: Anton M Date: Wed, 17 Jul 2019 21:41:19 +0300 Subject: [PATCH 15/67] Fix error File not found When I am using the `--src-tables-where-clauses` parameter I alltime getting error `File not found`. In the instruction I see `Comma-separated list of WHERE clauses for tables to be migrated. 
Ex.: db1.t1="a=1 and b=2",db2.t2="c=3 and k=4"` but in the code it all time try open some file. --- clickhouse_mysql/tablemigrator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/clickhouse_mysql/tablemigrator.py b/clickhouse_mysql/tablemigrator.py index 86cd7a7..8a16058 100644 --- a/clickhouse_mysql/tablemigrator.py +++ b/clickhouse_mysql/tablemigrator.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import logging +import os.path from MySQLdb.cursors import SSDictCursor,Cursor from clickhouse_mysql.tableprocessor import TableProcessor @@ -100,7 +101,11 @@ def __init__( db, table = TableProcessor.parse_full_table_name(full_table_name) if not db in self.where_clauses: self.where_clauses[db] = {} - self.where_clauses[db][table] = open(where_file_name, 'r').read().strip("\n") + + if os.path.isfile(where_file_name): + self.wheres[db][table] = open(where_file_name,'r').read().strip("\n") + else: + self.wheres[db][table] = where_file_name # debug info logging.info("migration where clauses") From 0d4f66ad56841f6ed21de283f3e525c0f674e627 Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Wed, 21 Aug 2019 17:36:06 +0300 Subject: [PATCH 16/67] dev: review UTF8 --- clickhouse_mysql/dbclient/mysqlclient.py | 1 + clickhouse_mysql/tablemigrator.py | 4 +- dev_run_cli_options_local_table_migration.sh | 48 ++++++++++++++++++++ setup.py | 2 +- 4 files changed, 51 insertions(+), 4 deletions(-) create mode 100755 dev_run_cli_options_local_table_migration.sh diff --git a/clickhouse_mysql/dbclient/mysqlclient.py b/clickhouse_mysql/dbclient/mysqlclient.py index 0b0923d..9dc258a 100644 --- a/clickhouse_mysql/dbclient/mysqlclient.py +++ b/clickhouse_mysql/dbclient/mysqlclient.py @@ -63,6 +63,7 @@ def connect(self, db): db=db, cursorclass=self.cursorclass, charset='utf8', + use_unicode=True, ) self.cursor = self.connection.cursor() logging.debug("Connect to the database host={} user={} password={} db={}".format( diff --git a/clickhouse_mysql/tablemigrator.py b/clickhouse_mysql/tablemigrator.py index 8a16058..59f9871 100644 --- a/clickhouse_mysql/tablemigrator.py +++ b/clickhouse_mysql/tablemigrator.py @@ -183,9 +183,7 @@ def migrate_one_table_data(self, db=None, table=None): # build SQL statement full_table_name = self.create_full_table_name(db=db, table=table) - sql = "SELECT {0} FROM {1}".format( - ",".join(self.get_columns(db,full_table_name)) - ,full_table_name) + sql = "SELECT {0} FROM {1}".format(",".join(self.get_columns(db, full_table_name)), full_table_name) # in case we have WHERE clause for this db.table - add it to SQL if db in self.where_clauses and table in self.where_clauses[db]: sql += " WHERE {}".format(self.where_clauses[db][table]) diff --git a/dev_run_cli_options_local_table_migration.sh b/dev_run_cli_options_local_table_migration.sh new file mode 100755 index 0000000..bcd984a --- /dev/null +++ b/dev_run_cli_options_local_table_migration.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# This script performs migration of one table test.books from local MySQL +# into one table test.books in local ClickHouse +# Tables are created manually by user and are expected by migrator to be in place +# Migrator exists after all data from migrated table is copied into ClickHouse + +# ugly stub to suppress unsufficient sockets +#sudo bash -c "echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse" + +# run data reader with specified Python version + +PYTHON="python3" + +CH_MYSQL="-m clickhouse_mysql.main" + +if [ ! 
-d "clickhouse_mysql" ]; then + # no clickhouse_mysql dir available - step out of examples dir + cd .. +fi + +$PYTHON $CH_MYSQL ${*:1} \ + --src-server-id=1 \ + --nice-pause=1 \ + --log-level=debug \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=qwerty \ + --src-tables=test.books \ + --dst-host=127.0.0.1 \ + --dst-schema=test \ + --dst-table=books \ + --csvpool \ + --csvpool-file-path-prefix=qwe_ \ + --mempool-max-flush-interval=60 \ + --mempool-max-events-num=10000 \ + --pump-data \ + --migrate-table + +# --log-file=ontime.log \ +# --mempool +# --mempool-max-events-num=3 +# --mempool-max-flush-interval=30 +# --dst-file=dst.csv +# --dst-schema=db +# --dst-table=datatypes +# --csvpool-keep-files +# --log-level=info \ diff --git a/setup.py b/setup.py index 9ceeca0..60c7355 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ name="clickhouse-mysql", # version should comply with PEP440 - version='0.0.20180321', + version='0.0.20190821', description='MySQL to ClickHouse data migrator', long_description='MySQL to ClickHouse data migrator', From 4be266d98487d80cd39a7ebba18f75207ed78d45 Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Mon, 26 Aug 2019 13:09:36 +0300 Subject: [PATCH 17/67] env: enhance dev scripts --- dev_run_config.sh => dev_run_config_file.sh | 0 package_clear_old.sh | 25 ++++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) rename dev_run_config.sh => dev_run_config_file.sh (100%) diff --git a/dev_run_config.sh b/dev_run_config_file.sh similarity index 100% rename from dev_run_config.sh rename to dev_run_config_file.sh diff --git a/package_clear_old.sh b/package_clear_old.sh index ae0e5e3..abcadff 100755 --- a/package_clear_old.sh +++ b/package_clear_old.sh @@ -1,15 +1,28 @@ #!/bin/bash -TO_DEL="build dist clickhouse_mysql.egg-info deb_dist" +# List of items (files and folders) to be deleted. 
+# These items are package-related +ITEMS_TO_DEL=" +build +dist +clickhouse_mysql.egg-info +deb_dist +" echo "########################################" echo "### Clear all build and release data ###" echo "########################################" -echo "Deleting:" -for DEL in $TO_DEL; do - echo " $DEL" +echo "About to delete:" +DEL="" +for ITEM in ${ITEMS_TO_DEL}; do + echo " ${ITEM}" + DEL="${DEL} ${ITEM}" done -echo "rm -rf $TO_DEL" -rm -rf $TO_DEL +if [[ -z "${DEL}" ]]; then + echo "No items to delete" +else + echo "rm -rf ${DEL}" + rm -rf ${DEL} +fi From 5c5148a61dfb6c0afdf971aa76cb92af9424e158 Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Mon, 26 Aug 2019 13:36:39 +0300 Subject: [PATCH 18/67] env: enhance RPM and deb packager --- .gitignore | 3 ++- package_deb_distr.sh => package_distr_deb.sh | 8 ++++++++ package_rpm_distr.sh => package_distr_rpm.sh | 15 +++++++++++++-- ...age_source_distr.sh => package_distr_source.sh | 0 ...age_wheels_distr.sh => package_distr_wheels.sh | 0 5 files changed, 23 insertions(+), 3 deletions(-) rename package_deb_distr.sh => package_distr_deb.sh (61%) rename package_rpm_distr.sh => package_distr_rpm.sh (57%) rename package_source_distr.sh => package_distr_source.sh (100%) rename package_wheels_distr.sh => package_distr_wheels.sh (100%) diff --git a/.gitignore b/.gitignore index 9f44b89..eb18180 100644 --- a/.gitignore +++ b/.gitignore @@ -4,12 +4,13 @@ *.egg *.egg-info dist +sdist +deb_dist build eggs parts bin var -sdist develop-eggs .installed.cfg diff --git a/package_deb_distr.sh b/package_distr_deb.sh similarity index 61% rename from package_deb_distr.sh rename to package_distr_deb.sh index 90da948..85d84a3 100755 --- a/package_deb_distr.sh +++ b/package_distr_deb.sh @@ -8,5 +8,13 @@ echo "##########################" python3 setup.py --command-packages=stdeb.command bdist_deb +echo "" +echo "" +echo "" +echo "############################" +echo "### Results - .deb files ###" +echo "############################" +ls -la ./deb_dist/*.deb + # pypi stdeb # apt install python3-all python3-stdeb diff --git a/package_rpm_distr.sh b/package_distr_rpm.sh similarity index 57% rename from package_rpm_distr.sh rename to package_distr_rpm.sh index 771e970..59b9cd6 100755 --- a/package_rpm_distr.sh +++ b/package_distr_rpm.sh @@ -2,11 +2,22 @@ ./package_clear_old.sh +echo "##########################" +echo "### Build RPM packages ###" +echo "##########################" + python3 setup.py bdist_rpm --packager="Vladislav Klimenko " # --spec-only -# ls -l ./build/bdist.linux-x86_64/rpm/SPECS/ -# ls -l ./dist/ +echo "" +echo "" +echo "" +echo "######################################" +echo "### Results - .spec and .rpm files ###" +echo "######################################" +ls -la ./build/bdist.linux-x86_64/rpm/SPECS/*.spec +ls -la ./dist/*.rpm + # build RPMs with # rpmbuild -ba ./build/bdist.linux-x86_64/rpm/SPECS/clickhouse-mysql.spec diff --git a/package_source_distr.sh b/package_distr_source.sh similarity index 100% rename from package_source_distr.sh rename to package_distr_source.sh diff --git a/package_wheels_distr.sh b/package_distr_wheels.sh similarity index 100% rename from package_wheels_distr.sh rename to package_distr_wheels.sh From 80bd93c1d08eea3b1a26b6811eabb64c2677d2f1 Mon Sep 17 00:00:00 2001 From: J Date: Sun, 1 Sep 2019 11:01:51 -0700 Subject: [PATCH 19/67] Port (--src-port) cli/config setting was being ignored --- clickhouse_mysql/dbclient/mysqlclient.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git 
a/clickhouse_mysql/dbclient/mysqlclient.py b/clickhouse_mysql/dbclient/mysqlclient.py index 9dc258a..0073bfe 100644 --- a/clickhouse_mysql/dbclient/mysqlclient.py +++ b/clickhouse_mysql/dbclient/mysqlclient.py @@ -58,6 +58,7 @@ def connect(self, db): try: self.connection = MySQLdb.connect( host=self.host, + port=self.port, user=self.user, passwd=self.password, db=db, @@ -66,15 +67,17 @@ def connect(self, db): use_unicode=True, ) self.cursor = self.connection.cursor() - logging.debug("Connect to the database host={} user={} password={} db={}".format( + logging.debug("Connect to the database host={} port={} user={} password={} db={}".format( self.host, + self.port, self.user, self.password, db )) except: - raise Exception("Can not connect to the database host={} user={} password={} db={}".format( + raise Exception("Can not connect to the database host={} port={} user={} password={} db={}".format( self.host, + self.port, self.user, self.password, db @@ -106,8 +109,9 @@ def tables_list(self, db): except Exception as err: logging.debug("Unexpected error: {}".format(str(err))) - raise Exception("Can not list tables on host={} user={} password={} db={}".format( + raise Exception("Can not list tables on host={} port={} user={} password={} db={}".format( self.host, + self.port, self.user, self.password, db From 6867044a3d57d82ab02358d073a8cf77f065e7dc Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Mon, 2 Sep 2019 12:52:46 +0300 Subject: [PATCH 20/67] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 60c7355..48b622a 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ name="clickhouse-mysql", # version should comply with PEP440 - version='0.0.20190821', + version='0.0.20190902', description='MySQL to ClickHouse data migrator', long_description='MySQL to ClickHouse data migrator', From 00f35f7e2470d54727c0aa704ab168e5e637b9b1 Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Mon, 2 Sep 2019 12:58:58 +0300 Subject: [PATCH 21/67] add sysadmins to audience --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 48b622a..65da03e 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,8 @@ 'Development Status :: 3 - Alpha', 'Intended Audience :: Developers', + 'Intended Audience :: System Administrators', + 'Topic :: Database', # should match license above From fbc5b5630955179951a857ab8b40375642070e0b Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Mon, 2 Sep 2019 13:01:01 +0300 Subject: [PATCH 22/67] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 65da03e..4d2e340 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ name="clickhouse-mysql", # version should comply with PEP440 - version='0.0.20190902', + version='0.0.20190903', description='MySQL to ClickHouse data migrator', long_description='MySQL to ClickHouse data migrator', From e54734f0b2275f62adb32c88d6e448cf3f2c9052 Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Mon, 2 Sep 2019 13:13:51 +0300 Subject: [PATCH 23/67] write --src-tables-where-clauses explanation --- CHANGELOG.md | 11 ++++++++++- clickhouse_mysql/clioptions.py | 2 +- db.log_201801_1.sql | 1 - db.log_201801_2.sql | 1 - db.log_201801_3.sql | 1 - setup.py | 2 +- 6 files changed, 12 insertions(+), 6 deletions(-) delete mode 100644 db.log_201801_1.sql delete mode 100644 db.log_201801_2.sql delete mode 100644 db.log_201801_3.sql diff --git a/CHANGELOG.md b/CHANGELOG.md index b4d6bea..fb68cc8 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +# clickhouse-mysql 2019-09-03 + +## improvements +* fix --src-tables-where-clauses to accept both filenames (for long where-clauses) and where-cluases themselves (for shorted clauses) + +## bugfixes +* fix --src-port CLI option +* ensure UTF8 for source migration + # clickhouse-mysql 2019-03-25 ## new features @@ -21,4 +30,4 @@ ## bugfixes * config files vs CLI options order fixed - \ No newline at end of file + diff --git a/clickhouse_mysql/clioptions.py b/clickhouse_mysql/clioptions.py index 6d8085c..7dedf47 100644 --- a/clickhouse_mysql/clioptions.py +++ b/clickhouse_mysql/clioptions.py @@ -342,7 +342,7 @@ def options(self): '--src-tables-where-clauses', type=str, default=self.default_options['src_tables_where_clauses'], - help='Comma-separated list of WHERE clauses for tables to be migrated. Ex.: db1.t1="a=1 and b=2",db2.t2="c=3 and k=4"' + help='Comma-separated list of WHERE clauses for tables to be migrated. Ex.: db1.t1="a=1 and b=2",db2.t2="c=3 and k=4". Accepts both (comma-separated) clause (useful for short clauses) or file where clause is located (useful for long clauses)' ) argparser.add_argument( '--src-tables-prefixes', diff --git a/db.log_201801_1.sql b/db.log_201801_1.sql deleted file mode 100644 index d578dfe..0000000 --- a/db.log_201801_1.sql +++ /dev/null @@ -1 +0,0 @@ -id < 1727831 diff --git a/db.log_201801_2.sql b/db.log_201801_2.sql deleted file mode 100644 index b4d0459..0000000 --- a/db.log_201801_2.sql +++ /dev/null @@ -1 +0,0 @@ -id < 1727834 diff --git a/db.log_201801_3.sql b/db.log_201801_3.sql deleted file mode 100644 index 7ee37b5..0000000 --- a/db.log_201801_3.sql +++ /dev/null @@ -1 +0,0 @@ -id < 1727855 diff --git a/setup.py b/setup.py index 4d2e340..36c2e81 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ name="clickhouse-mysql", # version should comply with PEP440 - version='0.0.20190903', + version='0.0.20190904', description='MySQL to ClickHouse data migrator', long_description='MySQL to ClickHouse data migrator', From 17439420aee6eef3e91d84a5b3103c5f128d7ace Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Wed, 9 Oct 2019 06:32:35 +0300 Subject: [PATCH 24/67] fix tables migration --- clickhouse_mysql/clioptions.py | 93 ++++++++++++------- clickhouse_mysql/config.py | 27 ++++-- clickhouse_mysql/pumper.py | 3 +- clickhouse_mysql/reader/mysqlreader.py | 8 +- clickhouse_mysql/tablemigrator.py | 10 ++- clickhouse_mysql/tableprocessor.py | 53 ++++++++--- clickhouse_mysql/tablesqlbuilder.py | 95 ++++++++++++++------ clickhouse_mysql/writer/chcsvwriter.py | 14 +-- clickhouse_mysql/writer/chwriter.py | 21 +++-- clickhouse_mysql/writer/csvwriter.py | 14 +-- dev_run_cli_options_local_table_migration.sh | 35 ++++++-- 11 files changed, 260 insertions(+), 113 deletions(-) diff --git a/clickhouse_mysql/clioptions.py b/clickhouse_mysql/clioptions.py index 7dedf47..4be23a2 100644 --- a/clickhouse_mysql/clioptions.py +++ b/clickhouse_mysql/clioptions.py @@ -16,25 +16,25 @@ def join_lists_into_dict(lists_to_join): [['a=b', 'c=d'], ['e=f', 'z=x'], ] :return: None or dictionary - {'a': 'b', 'c': 'd', 'e': 'f', 'z': 'x'} + {'a': 'b', 'c': 'd', 'e': 'f', 'y': 'z'} """ + # lists_to_join must be a list if not isinstance(lists_to_join, list): return None res = {} - for lst in lists_to_join: - # lst = ['a=b', 'c=d'] - for column_value_pair in lst: - # column_value_value = 'a=b' - column, value = column_value_pair.split('=', 2) - res[column] = value - # res = dict { - # 'col1': 'value1', - # 'col2': 
'value2', + # 'name1': 'value1', + # 'name2': 'value2', # } + for _list in lists_to_join: + # _list = ['a=b', 'c=d'] + for name_value_pair in _list: + # name_value_pair contains 'a=b' + name, value = name_value_pair.split('=', 2) + res[name] = value # return with sanity check if len(res) > 0: @@ -52,6 +52,7 @@ def join_lists(lists_to_join): ['a', 'b', 'c', 'd', 'e', 'f'] """ + # lists_to_join must be a list if not isinstance(lists_to_join, list): return None @@ -145,6 +146,7 @@ class CLIOptions(Options): 'dst_distribute': False, 'dst_cluster': None, 'dst_table': None, + 'dst_table_prefix': None, 'dst_create_table': False, # @@ -191,13 +193,13 @@ def options(self): '--nice-pause', type=int, default=self.default_options['nice_pause'], - help='make nice pause between attempts to read binlog stream' + help='Make specified (in sec) pause between attempts to read binlog stream' ) argparser.add_argument( '--dry', action='store_true', help='Dry mode - do not do anything that can harm. ' - 'Useful for debugging.' + 'Useful for debugging.' ) argparser.add_argument( '--daemon', @@ -208,13 +210,13 @@ def options(self): '--pid-file', type=str, default=self.default_options['pid_file'], - help='Pid file to be used by app in daemon mode' + help='Pid file to be used by the app in daemon mode' ) argparser.add_argument( '--binlog-position-file', type=str, default=self.default_options['binlog_position_file'], - help='File to write binlog position to' + help='File to write binlog position to during bin log reading and to read position from on start' ) argparser.add_argument( '--mempool', @@ -242,7 +244,8 @@ def options(self): argparser.add_argument( '--csvpool', action='store_true', - help='Cache data in CSV pool files on disk. Requires memory pooling, thus enables --mempool even if it is not explicitly specified' + help='Cache data in CSV pool files on disk. Requires memory pooling, ' + 'thus enables --mempool even if it is not explicitly specified' ) argparser.add_argument( '--csvpool-file-path-prefix', @@ -278,14 +281,19 @@ def options(self): argparser.add_argument( '--migrate-table', action='store_true', - help='Migrate table(s). IMPORTANT!. Target table has to be created in ClickHouse ' - 'or it has to be created with --create-table and possibly with --with-create-database options' - 'See --table-template and --table-create options for additional info.' + help='Migrate table(s). Copy existing data from MySQL table(s) with SELECT statement. ' + 'Binlog is not read during this procedure - just copy data from the src table(s). ' + 'IMPORTANT!. Target table has to be created in ClickHouse ' + 'or it has to be created with --dst-create-table and possibly with --with-create-database options. ' + 'See --create-table-sql-template and --create-table-sql options for additional info. ' ) argparser.add_argument( '--pump-data', action='store_true', - help='Pump data into ClickHouse' + help='Pump data from MySQL binlog into ClickHouse. Copy rows from binlog until the end of binlog reached. ' + 'When end of binlog reached, process ends. ' + 'Use in combination with --src-wait in case would like to continue and wait for new rows ' + 'after end of binlog reached' ) argparser.add_argument( '--install', @@ -330,19 +338,25 @@ def options(self): '--src-schemas', type=str, default=self.default_options['src_schemas'], - help='Comma-separated list of schemas to be used when reading from src. Ex.: db1,db2,db3' + help='Comma-separated list of databases (a.k.a schemas) to be used when reading from src. 
Ex.: db1,db2,db3' ) argparser.add_argument( '--src-tables', type=str, default=self.default_options['src_tables'], - help='Comma-separated list of tables to be used when reading from src. Ex.: table1,table2,table3' + help='Comma-separated list of tables to be used when reading from src. ' + 'Ex.: table1,table2,table3' + 'Ex.: db1.table1,db2.table2,db3.table3' + 'Ex.: table1,db2.table2,table3' ) argparser.add_argument( '--src-tables-where-clauses', type=str, default=self.default_options['src_tables_where_clauses'], - help='Comma-separated list of WHERE clauses for tables to be migrated. Ex.: db1.t1="a=1 and b=2",db2.t2="c=3 and k=4". Accepts both (comma-separated) clause (useful for short clauses) or file where clause is located (useful for long clauses)' + help='Comma-separated list of WHERE clauses for tables to be migrated. ' + 'Ex.: db1.t1="a=1 and b=2",db2.t2="c=3 and k=4". ' + 'Accepts both (comma-separated) clause (useful for short clauses) or ' + 'file where clause is located (useful for long clauses)' ) argparser.add_argument( '--src-tables-prefixes', @@ -360,19 +374,21 @@ def options(self): argparser.add_argument( '--src-resume', action='store_true', - help='Resume reading from previous position.' + help='Resume reading from previous position. Previous position is read from `binlog-position-file`' ) argparser.add_argument( '--src-binlog-file', type=str, default=self.default_options['src_binlog_file'], - help='Binlog file to be used when reading from src. Ex.: mysql-bin.000024' + help='Binlog file to be used to read from src. Related to `binlog-position-file`. ' + 'Ex.: mysql-bin.000024' ) argparser.add_argument( '--src-binlog-position', type=int, default=self.default_options['src_binlog_position'], - help='Binlog position to be used when reading from src. Ex.: 5703' + help='Binlog position to be used when reading from src. Related to `binlog-position-file`. ' + 'Ex.: 5703' ) argparser.add_argument( '--src-file', @@ -418,19 +434,22 @@ def options(self): '--dst-schema', type=str, default=self.default_options['dst_schema'], - help='Database/schema to be used when writing to dst. Ex.: db1' + help='Database (a.k.a schema) to be used to create tables in ClickHouse. ' + 'It overwrites source database(s) name(s), so tables in ClickHouse ' + 'would be located in differently named db than in MySQL. ' + 'Ex.: db1' ) argparser.add_argument( '--dst-distribute', action='store_true', default=self.default_options['dst_distribute'], - help='is to add distribute table' + help='Whether to add distribute table' ) argparser.add_argument( '--dst-cluster', type=str, default=self.default_options['dst_cluster'], - help='Cluster to be used when writing to dst. Ex.: db1' + help='Cluster to be used when writing to dst. Ex.: cluster1' ) argparser.add_argument( '--dst-table', @@ -438,6 +457,12 @@ def options(self): default=self.default_options['dst_table'], help='Table to be used when writing to dst. Ex.: table1' ) + argparser.add_argument( + '--dst-table-prefix', + type=str, + default=self.default_options['dst_table_prefix'], + help='Prefix to be used when creating dst table. Ex.: copy_table_' + ) argparser.add_argument( '--dst-create-table', action='store_true', @@ -453,7 +478,8 @@ def options(self): nargs='*', action='append', default=self.default_options['column_default_value'], - help='Set of key=value pairs for columns default values. Ex.: date_1=2000-01-01 timestamp_1=2002-01-01\ 01:02:03' + help='Set of key=value pairs for columns default values. 
' + 'Ex.: date_1=2000-01-01 timestamp_1=2002-01-01\ 01:02:03' ) argparser.add_argument( '--column-skip', @@ -535,6 +561,7 @@ def options(self): 'dst_distribute': args.dst_distribute, 'dst_cluster': args.dst_cluster, 'dst_table': args.dst_table, + 'dst_table_prefix': args.dst_table_prefix, 'dst_create_table': args.dst_create_table, # @@ -557,8 +584,8 @@ def options(filename): # def transform(section, key): - newkey = key.replace('-', '_') - section.rename(key, newkey) + new_key = key.replace('-', '_') + section.rename(key, new_key) # fetch base config try: @@ -567,7 +594,7 @@ def transform(section, key): encoding="utf-8", default_encoding="utf-8", list_values=True, - create_empty=False, # create empty config file + create_empty=False, # create empty config file stringify=True, raise_errors=False, file_error=False, @@ -582,7 +609,7 @@ def transform(section, key): encoding="utf-8", default_encoding="utf-8", list_values=True, - create_empty=False, # create empty config file + create_empty=False, # create empty config file stringify=True, raise_errors=False, file_error=False, diff --git a/clickhouse_mysql/config.py b/clickhouse_mysql/config.py index 91d79d8..e4551c8 100644 --- a/clickhouse_mysql/config.py +++ b/clickhouse_mysql/config.py @@ -107,7 +107,7 @@ def __init__(self): 'port': self.options.get_int('src_port'), 'user': self.options['src_user'], 'password': self.options['src_password'], - 'dbs': self.options.get_list('src_schemas'), + 'schemas': self.options.get_list('src_schemas'), 'tables': self.options.get_list('src_tables'), 'tables_prefixes': self.options.get_list('src_tables_prefixes'), 'column_skip': self.options['column_skip'] @@ -122,6 +122,8 @@ def __init__(self): 'dst_schema': self.options['dst_schema'], 'dst_distribute': self.options['dst_distribute'], 'dst_cluster': self.options['dst_cluster'], + 'dst_table': self.options['dst_table'], + 'dst_table_prefix': self.options['dst_table_prefix'], 'dst_create_table': self.options.get_bool('dst_create_table'), }, }, @@ -135,7 +137,7 @@ def __init__(self): 'port': self.options.get_int('src_port'), 'user': self.options['src_user'], 'password': self.options['src_password'], - 'dbs': self.options.get_list('src_schemas'), + 'schemas': self.options.get_list('src_schemas'), 'tables': self.options.get_list('src_tables'), 'tables_prefixes': self.options.get_list('src_tables_prefixes'), 'tables_where_clauses': self.options.get_list('src_tables_where_clauses'), @@ -152,6 +154,7 @@ def __init__(self): 'dst_distribute': self.options['dst_distribute'], 'dst_cluster': self.options['dst_cluster'], 'dst_table': self.options['dst_table'], + 'dst_table_prefix': self.options['dst_table_prefix'], 'dst_create_table': self.options.get_bool('dst_create_table'), }, }, @@ -195,8 +198,9 @@ def __init__(self): 'password': self.options['dst_password'], }, 'dst_schema': self.options['dst_schema'], - 'dst_table': self.options['dst_table'], 'dst_distribute': self.options['dst_distribute'], + 'dst_table': self.options['dst_table'], + 'dst_table_prefix': self.options['dst_table_prefix'], }, 'file': { 'csv_file_path': self.options['dst_file'], @@ -204,7 +208,9 @@ def __init__(self): 'csv_file_path_suffix_parts': [], 'csv_keep_file': self.options['csvpool_keep_files'], 'dst_schema': self.options['dst_schema'], + 'dst_distribute': self.options['dst_distribute'], 'dst_table': self.options['dst_table'], + 'dst_table_prefix': self.options['dst_table_prefix'], }, }, } @@ -254,8 +260,10 @@ def table_sql_builder(self): port=self.config['table_builder']['mysql']['port'], 
user=self.config['table_builder']['mysql']['user'], password=self.config['table_builder']['mysql']['password'], - dbs=self.config['table_builder']['mysql']['dbs'], - schema=self.config['table_builder']['clickhouse']['dst_schema'], + dbs=self.config['table_builder']['mysql']['schemas'], + dst_schema=self.config['table_builder']['clickhouse']['dst_schema'], + dst_table=self.config['table_builder']['clickhouse']['dst_table'], + dst_table_prefix=self.config['table_builder']['clickhouse']['dst_table_prefix'], distribute=self.config['table_builder']['clickhouse']['dst_distribute'], cluster=self.config['table_builder']['clickhouse']['dst_cluster'], tables=self.config['table_builder']['mysql']['tables'], @@ -278,8 +286,10 @@ def table_migrator(self): port=self.config['table_migrator']['mysql']['port'], user=self.config['table_migrator']['mysql']['user'], password=self.config['table_migrator']['mysql']['password'], - dbs=self.config['table_migrator']['mysql']['dbs'], - schema=self.config['table_migrator']['clickhouse']['dst_schema'], + dbs=self.config['table_migrator']['mysql']['schemas'], + dst_schema=self.config['table_migrator']['clickhouse']['dst_schema'], + dst_table=self.config['table_builder']['clickhouse']['dst_table'], + dst_table_prefix=self.config['table_builder']['clickhouse']['dst_table_prefix'], distribute=self.config['table_migrator']['clickhouse']['dst_distribute'], cluster=self.config['table_migrator']['clickhouse']['dst_cluster'], tables=self.config['table_migrator']['mysql']['tables'], @@ -347,6 +357,7 @@ def writer_builder_csvpool(self): 'csv_keep_file': self.config['writer']['file']['csv_keep_file'], 'dst_schema': self.config['writer']['file']['dst_schema'], 'dst_table': self.config['writer']['file']['dst_table'], + 'dst_table_prefix': self.config['writer']['file']['dst_table_prefix'], 'next_writer_builder': ObjectBuilder( class_name=CHCSVWriter, constructor_params=self.config['writer']['clickhouse'] @@ -363,6 +374,7 @@ def writer_builder_csv_file(self): 'csv_keep_file': self.config['writer']['file']['csv_keep_file'], 'dst_schema': self.config['writer']['file']['dst_schema'], 'dst_table': self.config['writer']['file']['dst_table'], + 'dst_table_prefix': self.config['writer']['file']['dst_table_prefix'], 'next_writer_builder': None, 'converter_builder': self.converter_builder(CONVERTER_CSV), }) @@ -377,6 +389,7 @@ def writer_builder_chwriter(self): }, 'dst_schema': self.config['writer']['clickhouse']['dst_schema'], 'dst_table': self.config['writer']['clickhouse']['dst_table'], + 'dst_table_prefix': self.config['writer']['clickhouse']['dst_table_prefix'], 'dst_distribute': self.config['writer']['clickhouse']['dst_distribute'], 'next_writer_builder': None, 'converter_builder': self.converter_builder(CONVERTER_CH), diff --git a/clickhouse_mysql/pumper.py b/clickhouse_mysql/pumper.py index 959da6e..e75bc34 100644 --- a/clickhouse_mysql/pumper.py +++ b/clickhouse_mysql/pumper.py @@ -1,7 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import time class Pumper(object): """ @@ -20,7 +19,7 @@ def __init__(self, reader=None, writer=None): # subscribe on reader's event notifications self.reader.subscribe({ 'WriteRowsEvent': self.write_rows_event, -# 'WriteRowsEvent.EachRow': self.write_rows_event_each_row, + # 'WriteRowsEvent.EachRow': self.write_rows_event_each_row, 'ReaderIdleEvent': self.reader_idle_event, }) diff --git a/clickhouse_mysql/reader/mysqlreader.py b/clickhouse_mysql/reader/mysqlreader.py index 96edbdd..f21e8b1 100644 --- a/clickhouse_mysql/reader/mysqlreader.py +++ 
b/clickhouse_mysql/reader/mysqlreader.py @@ -122,7 +122,7 @@ def __init__( only_tables=self.tables if not self.tables_prefixes else None, log_file=self.log_file, log_pos=self.log_pos, - freeze_schema=True, # If true do not support ALTER TABLE. It's faster. + freeze_schema=True, # If true do not support ALTER TABLE. It's faster. blocking=False, resume_stream=self.resume_stream, ) @@ -321,11 +321,15 @@ def read(self): self.stat_init_fetch_loop() try: + logging.debug('Pre-start binlog position: ' + self.binlog_stream.log_file + ":" + str(self.binlog_stream.log_pos) if self.binlog_stream.log_pos is not None else "undef") + # fetch available events from MySQL for mysql_event in self.binlog_stream: # new event has come # check what to do with it + logging.debug('Got Event ' + self.binlog_stream.log_file + ":" + str(self.binlog_stream.log_pos)) + # process event based on its type if isinstance(mysql_event, WriteRowsEvent): self.process_write_rows_event(mysql_event) @@ -363,7 +367,7 @@ def read(self): if not self.blocking: # do not wait for more data - all done - break # while True + break # while True # blocking - wait for more data if self.nice_pause > 0: diff --git a/clickhouse_mysql/tablemigrator.py b/clickhouse_mysql/tablemigrator.py index 59f9871..750b8b0 100644 --- a/clickhouse_mysql/tablemigrator.py +++ b/clickhouse_mysql/tablemigrator.py @@ -38,7 +38,9 @@ def __init__( user=None, password=None, dbs=None, - schema=None, + dst_schema=None, + dst_table=None, + dst_table_prefix=None, distribute=None, cluster=None, tables=None, @@ -52,7 +54,9 @@ def __init__( user=user, password=password, dbs=dbs, - schema=schema, + dst_schema=dst_schema, + dst_table=dst_table, + dst_table_prefix=dst_table_prefix, distribute=distribute, cluster=cluster, tables=tables, @@ -103,7 +107,7 @@ def __init__( self.where_clauses[db] = {} if os.path.isfile(where_file_name): - self.wheres[db][table] = open(where_file_name,'r').read().strip("\n") + self.wheres[db][table] = open(where_file_name, 'r').read().strip("\n") else: self.wheres[db][table] = where_file_name diff --git a/clickhouse_mysql/tableprocessor.py b/clickhouse_mysql/tableprocessor.py index 486026b..7bb96b8 100644 --- a/clickhouse_mysql/tableprocessor.py +++ b/clickhouse_mysql/tableprocessor.py @@ -25,7 +25,9 @@ def __init__( user=None, password=None, dbs=None, - schema=None, + dst_schema=None, + dst_table=None, + dst_table_prefix=None, distribute=None, cluster=None, tables=None, @@ -37,7 +39,7 @@ def __init__( :param port: int MySQL port :param user: string MySQL user :param password: string MySQL password - :param dbs: list of string MySQL datatabse. May be omitted, in this case tables has to contain full table names, Ex.: db.table1 + :param dbs: list of string MySQL databases. May be omitted, in this case tables has to contain full table names, Ex.: db.table1 :param tables: list of string list of table names. Table names may be short or full form :param tables_prefixes: list of string list of table prefixes. 
May be short or full form """ @@ -50,7 +52,9 @@ def __init__( 'user': user, 'password': password, }) - self.schema = schema + self.dst_schema = dst_schema + self.dst_table = dst_table + self.dst_table_prefix = dst_table_prefix self.cluster = cluster self.distribute = distribute self.column_skip = column_skip @@ -59,7 +63,7 @@ def dbs_tables_lists(self): """ Prepare dict of databases and with list of tables for each db Include all tables into db tables list in case to tables are explicitly specified - It still can be no tables - incase db really has no tables + It still can be no tables - in case db really has no tables For convenient iteration over all tables :return: @@ -139,22 +143,42 @@ def tables_match(self, db, prefix): return res @staticmethod - def create_full_table_name(schema=None, db=None, table=None, distribute=None): + def create_full_table_name(dst_schema=None, dst_table=None, dst_table_prefix=None, db=None, table=None, distribute=None): """ Create fully-specified table name as `schema_all`.`db__table_all` or `schema`.`db__table` or just `db`.`table` + :param dst_schema: :param db: :param table: + :param distribute: :return: `schema_all`.`db__table_all` or `schema`.`db__table` or just `db`.`table` """ - if schema != None: - if distribute: - schema += "_all" - table += "_all" - return '`{0}`.`{1}`'.format(schema, db+"__"+table) if db else '`{0}`'.format(table) - else: + + # target table can be renamed with dst_table + table = dst_table if dst_table is not None else table + + # simple case - do not move table into another db + if dst_schema is None: return '`{0}`.`{1}`'.format(db, table) if db else '`{0}`'.format(table) + if distribute: + dst_schema += "_all" + table += "_all" + + return \ + '`{0}`.`{1}`'.format(dst_schema, TableProcessor.create_migrated_table_name(prefix=dst_table_prefix, table=table)) \ + if db else \ + '`{0}`'.format(table) + + @staticmethod + def create_migrated_table_name(prefix=None, table=None): + prefix = prefix if prefix is not None else "" + return prefix + table + + @staticmethod + def create_distributed_table_name(db=None, table=None): + return db + "__" + table + "_all" + @staticmethod def is_full_table_name(full_name): """ @@ -279,7 +303,9 @@ def extract_dbs(dbs=[], tables=[]): :param tables: list of tables with (otional) full names :return: set of db names """ - dbs_group = TableProcessor.group_tables(dbs=dbs, tables=tables, unsettled_tables_action=TableProcessor.ACTION_IGNORE_TABLE) + dbs_group = TableProcessor.group_tables(dbs=dbs, + tables=tables, + unsettled_tables_action=TableProcessor.ACTION_IGNORE_TABLE) return dbs_group.keys() @@ -290,7 +316,8 @@ def extract_tables(tables=[]): :param tables: list of (possibly) full names :return: set of short names """ - dbs_group = TableProcessor.group_tables(tables=tables, unsettled_tables_action=TableProcessor.ACTION_INCLUDE_TABLE) + dbs_group = TableProcessor.group_tables(tables=tables, + unsettled_tables_action=TableProcessor.ACTION_INCLUDE_TABLE) res = set() for db in dbs_group: res.update(dbs_group[db]) diff --git a/clickhouse_mysql/tablesqlbuilder.py b/clickhouse_mysql/tablesqlbuilder.py index 4cb90bc..77d10f2 100644 --- a/clickhouse_mysql/tablesqlbuilder.py +++ b/clickhouse_mysql/tablesqlbuilder.py @@ -5,6 +5,7 @@ from MySQLdb.cursors import Cursor import logging + class TableSQLBuilder(TableProcessor): """ Build ClickHouse table(s) @@ -17,10 +18,12 @@ def templates(self): :return: dict of ClickHouse's CREATE TABLE () templates { 'db1': { - 'table1': CREATE TABLE TABLE1 TEMPLATE, + 'table-db1-1': 
CREATE TABLE table1 statement template, + 'table-db1-2': CREATE TABLE table2 statement template, }, 'db2': { - 'table2': CREATE TABLE TABLE2 TEMPLATE, + 'table-db2-1': CREATE TABLE table1 statement template, + 'table-db2-2': CREATE TABLE table2 statement template, } } """ @@ -32,38 +35,57 @@ def templates(self): for db in dbs: templates[db] = {} for table in dbs[db]: - templates[db][table] = self.create_table_description(cluster=self.cluster, schema=self.schema, db=db, table=table) + templates[db][table] = self.create_table_description( + cluster=self.cluster, + dst_schema=self.dst_schema, + dst_table=self.dst_table, + dst_table_prefix=self.dst_table_prefix, + db=db, + table=table) return templates - def create_table_description(self, cluster=None, schema=None, db=None, table=None): + def create_table_description(self, cluster=None, dst_schema=None, dst_table=None, dst_table_prefix=None, db=None, table=None): """ High-level function. Produce either text ClickHouse's table SQL CREATE TABLE() template or JSON ClikcHouse's table description :param db: string MySQL db name :param table: string MySQL table name - :param json: bool what shold return - json description or ClickHouse's SQL template + :param json: bool what should return - json description or ClickHouse's SQL template :return: dict{"template":SQL, "fields": {}} or string SQL """ columns_description = self.create_table_columns_description(db=db, table=table) return { - "create_table_template": self.create_table_sql_template(cluster=cluster, schema=schema, db=db, table=table, columns_description=columns_description), - "create_table": self.create_table_sql(cluster=cluster, schema=schema, db=db, table=table, columns_description=columns_description), - "create_database": self.create_database_sql(db=db), + "create_table_template": self.create_table_sql_template(cluster=cluster, + dst_schema=dst_schema, + dst_table=dst_table, + dst_table_prefix=dst_table_prefix, + db=db, + table=table, + columns_description=columns_description), + "create_table": self.create_table_sql(cluster=cluster, + dst_schema=dst_schema, + dst_table=dst_table, + dst_table_prefix=dst_table_prefix, + db=db, + table=table, + columns_description=columns_description), + "create_database": self.create_database_sql(dst_schema=dst_schema, db=db), "fields": columns_description, } - def create_table_sql_template(self, cluster=None, schema=None, db=None, table=None, columns_description=None): + def create_table_sql_template(self, cluster=None, dst_schema=None, dst_table=None, dst_table_prefix=None, db=None, table=None, columns_description=None): """ Produce table template for ClickHouse - CREATE TABLE( + CREATE TABLE schema.table ( ... columns specification ... 
) ENGINE = MergeTree(_, (), 8192) for specified MySQL's table - :param table: string - name of the table in MySQL which will be used as a base for CH's CREATE TABLE template + :param db: string - name of the DB in MySQL + :param table: string - name of the table in MySQL which will be used as a base for CH's CREATE TABLE template :return: string - almost-ready-to-use ClickHouse CREATE TABLE statement """ @@ -76,13 +98,13 @@ def create_table_sql_template(self, cluster=None, schema=None, db=None, table=No ) ENGINE = MergeTree(, (), 8192) """.format( - self.create_full_table_name(schema=schema, db=db, table=table), - "on cluster {}".format(cluster) if cluster != None else "", + self.create_full_table_name(dst_schema=dst_schema, dst_table=dst_table, dst_table_prefix=dst_table_prefix, db=db, table=table), + "on cluster {}".format(cluster) if cluster is not None else "", ",\n ".join(ch_columns), ) return sql - def create_table_sql(self, cluster=None, schema=None, db=None, table=None, columns_description=None): + def create_table_sql(self, cluster=None, dst_schema=None, dst_table=None, dst_table_prefix=None, db=None, table=None, columns_description=None): """ Produce table template for ClickHouse CREATE TABLE( @@ -91,8 +113,9 @@ def create_table_sql(self, cluster=None, schema=None, db=None, table=None, colum ... ) ENGINE = MergeTree(PRIMARY DATE FIELD, (COMMA SEPARATED INDEX FIELDS LIST), 8192) for specified MySQL's table - :param table: string - name of the table in MySQL which will be used as a base for CH's CREATE TABLE template + :param db: string - name of the DB in MySQL + :param table: string - name of the table in MySQL which will be used as a base for CH's CREATE TABLE template :return: string - ready-to-use ClickHouse CREATE TABLE statement """ @@ -122,23 +145,28 @@ def create_table_sql(self, cluster=None, schema=None, db=None, table=None, colum ) {} """.format( - self.create_full_table_name(schema=schema, db=db, table=table, distribute=self.distribute), - "on cluster {}".format(cluster) if not self.distribute and cluster != None else "", + self.create_full_table_name(dst_schema=dst_schema, dst_table=dst_table, dst_table_prefix=dst_table_prefix, db=db, table=table, distribute=self.distribute), + "on cluster {}".format(cluster) if not self.distribute and cluster is not None else "", ",\n ".join(ch_columns), - self.create_table_engine(self.cluster, self.schema, db+"__"+table, primary_date_field, ",".join(primary_key_fields), self.distribute), - + self.create_table_engine(self.cluster, + self.dst_schema, + self.create_migrated_table_name(prefix=dst_table_prefix, table=dst_table) if dst_table is not None else self.create_migrated_table_name(prefix=dst_table_prefix, table=table), + primary_date_field, + ",".join(primary_key_fields), + self.distribute) ) return sql - def create_database_sql(self, db): + def create_database_sql(self, dst_schema=None, db=None): """ Produce create database statement for ClickHouse CREATE DATABASE for specified MySQL's db - :param db: string - name of the DB in MySQL + + :param db: string - name of the DB :return: string - ready-to-use ClickHouse CREATE DATABASE statement """ - sql = "CREATE DATABASE IF NOT EXISTS `{}`".format(db) + sql = "CREATE DATABASE IF NOT EXISTS `{}`".format(dst_schema if dst_schema is not None else db) return sql def create_table_columns_description(self, db=None, table=None, ): @@ -190,9 +218,9 @@ def fetch_primary_date_field(self, columns_description): :return: string|None """ for column_description in columns_description: - if 
(column_description['clickhouse_type'] == 'Date'): + if column_description['clickhouse_type'] == 'Date': return column_description['field'] - if (column_description['clickhouse_type'] == 'DateTime'): + if column_description['clickhouse_type'] == 'DateTime': return column_description['field'] return None @@ -338,8 +366,23 @@ def map_type_nullable(self, mysql_type, nullable=False): return ch_type - def create_table_engine(self, cluster=None, dst_schema=None, dst_table=None,primary_date_field=None, primary_key_fields=None, distribute=None): - if distribute : + def create_table_engine(self, + cluster=None, + dst_schema=None, + dst_table=None, + primary_date_field=None, + primary_key_fields=None, + distribute=None): + """ + :param cluster: + :param dst_schema: + :param dst_table: + :param primary_date_field: + :param primary_key_fields: + :param distribute: + :return: + """ + if distribute: return "ENGINE = Distributed({}, '{}', '{}', rand())".format( cluster, dst_schema, diff --git a/clickhouse_mysql/writer/chcsvwriter.py b/clickhouse_mysql/writer/chcsvwriter.py index 8b8ff00..aa4e8a0 100644 --- a/clickhouse_mysql/writer/chcsvwriter.py +++ b/clickhouse_mysql/writer/chcsvwriter.py @@ -6,6 +6,7 @@ import logging from clickhouse_mysql.writer.writer import Writer +from clickhouse_mysql.tableprocessor import TableProcessor class CHCSVWriter(Writer): @@ -25,6 +26,7 @@ def __init__( connection_settings, dst_schema=None, dst_table=None, + dst_table_prefix=None, dst_distribute=False, ): if dst_distribute and dst_schema is not None: @@ -38,6 +40,7 @@ def __init__( self.password = connection_settings['password'] self.dst_schema = dst_schema self.dst_table = dst_table + self.dst_table_prefix = dst_table_prefix self.dst_distribute = dst_distribute def insert(self, event_or_events=None): @@ -62,13 +65,12 @@ def insert(self, event_or_events=None): for event in events: schema = self.dst_schema if self.dst_schema else event.schema table = None - if self.dst_table: - table = self.dst_table - elif self.dst_distribute: - # if current is going to insert distributed table,we need '_all' suffix - table = event.schema + "__" + event.table + "_all" + if self.dst_distribute: + table = TableProcessor.create_distributed_table_name(db=event.schema, table=event.table) else: - table = event.schema + "__" + event.table + table = self.dst_table if self.dst_table else event.table + if self.dst_schema: + table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) sql = 'INSERT INTO `{0}`.`{1}` ({2}) FORMAT CSV'.format( schema, diff --git a/clickhouse_mysql/writer/chwriter.py b/clickhouse_mysql/writer/chwriter.py index 5638795..587d48f 100644 --- a/clickhouse_mysql/writer/chwriter.py +++ b/clickhouse_mysql/writer/chwriter.py @@ -9,6 +9,7 @@ from clickhouse_mysql.dbclient.chclient import CHClient from clickhouse_mysql.writer.writer import Writer +from clickhouse_mysql.tableprocessor import TableProcessor from clickhouse_mysql.event.event import Event @@ -25,6 +26,7 @@ def __init__( connection_settings, dst_schema=None, dst_table=None, + dst_table_prefix=None, dst_distribute=False, next_writer_builder=None, converter_builder=None, @@ -33,10 +35,12 @@ def __init__( dst_schema += "_all" if dst_distribute and dst_table is not None: dst_table += "_all" - logging.info("CHWriter() connection_settings={} dst_schema={} dst_table={} dst_distribute={}".format(connection_settings, dst_schema, dst_table, dst_distribute)) + logging.info("CHWriter() connection_settings={} dst_schema={} dst_table={} 
dst_distribute={}".format( + connection_settings, dst_schema, dst_table, dst_distribute)) self.client = CHClient(connection_settings) self.dst_schema = dst_schema self.dst_table = dst_table + self.dst_table_prefix = dst_table_prefix self.dst_distribute = dst_distribute def insert(self, event_or_events=None): @@ -71,7 +75,7 @@ def insert(self, event_or_events=None): for row in event_converted: for key in row.keys(): # we need to convert Decimal value to str value for suitable for table structure - if (type(row[key]) == Decimal): + if type(row[key]) == Decimal: row[key] = str(row[key]) rows.append(row) @@ -81,13 +85,13 @@ def insert(self, event_or_events=None): schema = self.dst_schema if self.dst_schema else event_converted.schema table = None - if self.dst_table: - table = self.dst_table - elif self.dst_distribute: - # if current is going to insert distributed table,we need '_all' suffix - table = event_converted.schema + "__" + event_converted.table + "_all" + if self.dst_distribute: + table = TableProcessor.create_distributed_table_name(db=event_converted.schema, table=event_converted.table) else: - table = event_converted.schema + "__" + event_converted.table + table = self.dst_table if self.dst_table else event_converted.table + if self.dst_schema: + table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) + logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format(schema, table, self.dst_schema, self.dst_table)) # and INSERT converted rows @@ -109,7 +113,6 @@ def insert(self, event_or_events=None): # all DONE - if __name__ == '__main__': connection_settings = { 'host': '192.168.74.230', diff --git a/clickhouse_mysql/writer/csvwriter.py b/clickhouse_mysql/writer/csvwriter.py index eba0928..4ff9081 100644 --- a/clickhouse_mysql/writer/csvwriter.py +++ b/clickhouse_mysql/writer/csvwriter.py @@ -34,16 +34,17 @@ def __init__( csv_keep_file=False, dst_schema=None, dst_table=None, + dst_table_prefix=None, next_writer_builder=None, converter_builder=None, ): logging.info("CSVWriter() " - "csv_file_path={} " - "csv_file_path_prefix={} " - "csv_file_path_suffix_parts={} " - "csv_keep_file={} " - "dst_schema={} " - "dst_table={} ".format( + "csv_file_path={} " + "csv_file_path_prefix={} " + "csv_file_path_suffix_parts={} " + "csv_keep_file={} " + "dst_schema={} " + "dst_table={} ".format( csv_file_path, csv_file_path_prefix, csv_file_path_suffix_parts, @@ -58,6 +59,7 @@ def __init__( self.path_suffix_parts = csv_file_path_suffix_parts self.dst_schema = dst_schema self.dst_table = dst_table + self.dst_table_prefix = dst_table_prefix if self.path is None: if not self.path_suffix_parts: diff --git a/dev_run_cli_options_local_table_migration.sh b/dev_run_cli_options_local_table_migration.sh index bcd984a..3ac6d53 100755 --- a/dev_run_cli_options_local_table_migration.sh +++ b/dev_run_cli_options_local_table_migration.sh @@ -19,24 +19,47 @@ if [ ! -d "clickhouse_mysql" ]; then cd .. 
fi +MYSQL_USER=reader +MYSQL_PASSWORD=qwerty +SRC_TABLES=test.books +DST_SCHEMA=test +DST_TABLE=books + +MYSQL_USER=user1 +MYSQL_PASSWORD=qwerty +SRC_TABLES=repl.foo +DST_SCHEMA=repl1 +DST_TABLE=foo1 + $PYTHON $CH_MYSQL ${*:1} \ --src-server-id=1 \ --nice-pause=1 \ --log-level=debug \ + \ --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=qwerty \ - --src-tables=test.books \ + --src-user="${MYSQL_USER}" \ + --src-password="${MYSQL_PASSWORD}" \ + --src-tables="${SRC_TABLES}" \ + \ --dst-host=127.0.0.1 \ - --dst-schema=test \ - --dst-table=books \ + --dst-create-table \ + --with-create-database \ + \ --csvpool \ --csvpool-file-path-prefix=qwe_ \ --mempool-max-flush-interval=60 \ --mempool-max-events-num=10000 \ + \ + --binlog-position-file=qwe.txt \ --pump-data \ - --migrate-table + --migrate-table \ + --src-wait \ + --src-resume +# --dst-schema="${DST_SCHEMA}" \ +# --dst-table="${DST_TABLE}" \ +# --dst-table="${DST_SCHEMA}.${DST_TABLE}" \ +# --dst-table-prefix="pr1_" \ # --log-file=ontime.log \ # --mempool # --mempool-max-events-num=3 From 1a0f7cfb6fcd2344df0033f5683662f8b69daf31 Mon Sep 17 00:00:00 2001 From: Sorin Dumitrescu Date: Mon, 4 Nov 2019 15:07:24 +0200 Subject: [PATCH 25/67] Escape column names used in MySQL select statements --- clickhouse_mysql/tablemigrator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clickhouse_mysql/tablemigrator.py b/clickhouse_mysql/tablemigrator.py index 750b8b0..84f095e 100644 --- a/clickhouse_mysql/tablemigrator.py +++ b/clickhouse_mysql/tablemigrator.py @@ -231,7 +231,8 @@ def get_columns(self,db,full_table_name): if self.column_skip.__contains__(_field): logging.debug("skip column %s",_field) continue - fields.append(_field) + fields.append('`{}`'.format(_field)) + return fields if __name__ == '__main__': From 6100f20e517429bd7b3eb50c6159dcbe958d7e54 Mon Sep 17 00:00:00 2001 From: Mihai Chitic Date: Wed, 22 Jan 2020 13:54:16 +0200 Subject: [PATCH 26/67] set exit code 1 for run too --- clickhouse_mysql/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/clickhouse_mysql/main.py b/clickhouse_mysql/main.py index bab32eb..662fd4d 100644 --- a/clickhouse_mysql/main.py +++ b/clickhouse_mysql/main.py @@ -153,6 +153,7 @@ def run(self): traceback.print_exc(file=sys.stdout) print('=============') print(ex) + sys.exit(1); def start(self): if self.config.is_daemon(): From a194979121bd5f279dbcd009885fd5b2e5a497b6 Mon Sep 17 00:00:00 2001 From: Kimmo Mustonen Date: Mon, 27 Jan 2020 11:31:50 +0100 Subject: [PATCH 27/67] fix wheres -> where_clauses --- clickhouse_mysql/tablemigrator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clickhouse_mysql/tablemigrator.py b/clickhouse_mysql/tablemigrator.py index 84f095e..07e0986 100644 --- a/clickhouse_mysql/tablemigrator.py +++ b/clickhouse_mysql/tablemigrator.py @@ -107,9 +107,9 @@ def __init__( self.where_clauses[db] = {} if os.path.isfile(where_file_name): - self.wheres[db][table] = open(where_file_name, 'r').read().strip("\n") + self.where_clauses[db][table] = open(where_file_name, 'r').read().strip("\n") else: - self.wheres[db][table] = where_file_name + self.where_clauses[db][table] = where_file_name # debug info logging.info("migration where clauses") From 811c59e3bfe01223a6ad6dbfe1fc1f04c07c4618 Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Wed, 29 Jan 2020 16:29:00 +0300 Subject: [PATCH 28/67] dev: add shell quotation --- clickhouse_mysql/writer/chcsvwriter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/clickhouse_mysql/writer/chcsvwriter.py b/clickhouse_mysql/writer/chcsvwriter.py index aa4e8a0..caea56e 100644 --- a/clickhouse_mysql/writer/chcsvwriter.py +++ b/clickhouse_mysql/writer/chcsvwriter.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import os -import time import logging +import shlex from clickhouse_mysql.writer.writer import Writer from clickhouse_mysql.tableprocessor import TableProcessor @@ -80,13 +80,13 @@ def insert(self, event_or_events=None): choptions = "" if self.host: - choptions += " --host=" + self.host + choptions += " --host=" + shlex.quote(self.host) if self.port: choptions += " --port=" + str(self.port) if self.user: - choptions += " --user=" + self.user + choptions += " --user=" + shlex.quote(self.user) if self.password: - choptions += " --password=" + self.password + choptions += " --password=" + shlex.quote(self.password) bash = "tail -n +2 '{0}' | clickhouse-client {1} --query='{2}'".format( event.filename, choptions, From 6ffc91a03de0b369f4d419ba34aeb44ee9a7ea5a Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Wed, 29 Jan 2020 16:35:54 +0300 Subject: [PATCH 29/67] new release --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 36c2e81..f5be528 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ name="clickhouse-mysql", # version should comply with PEP440 - version='0.0.20190904', + version='0.0.20200128', description='MySQL to ClickHouse data migrator', long_description='MySQL to ClickHouse data migrator', @@ -20,7 +20,7 @@ license="MIT", - # see https://pypi.python.org/pypi?%3Aaction=list_classifiers + # see https://pypi.python.org/pypi?:action=list_classifiers classifiers=[ # How mature is this project? Common values are # 3 - Alpha From 1c6b52b8533be8cdbd7653efdc81e6569cdcb72a Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Wed, 29 Jan 2020 16:48:59 +0300 Subject: [PATCH 30/67] docs: fix src-tables --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index be2b7be..44e92a2 100644 --- a/README.md +++ b/README.md @@ -388,7 +388,7 @@ clickhouse-mysql \ --src-password=Qwerty1# \ --create-table-sql-template \ --with-create-database \ - --src-only-table=airline.ontime > create_clickhouse_table_template.sql + --src-tables=airline.ontime > create_clickhouse_table_template.sql ``` We have **CREATE TABLE** template stored in `create_clickhouse_table_template.sql` file. ```bash @@ -425,7 +425,7 @@ clickhouse-mysql \ --src-user=reader \ --src-password=Qwerty1# \ --migrate-table \ - --src-only-table=airline.ontime \ + --src-tables=airline.ontime \ --dst-host=127.0.0.1 ``` This may take some time. 
@@ -451,8 +451,9 @@ clickhouse-mysql \ --src-wait \ --nice-pause=1 \ --src-host=127.0.0.1 \ - --src-user=reader --src-password=Qwerty1# \ - --src-only-table=airline.ontime \ + --src-user=reader \ + --src-password=Qwerty1# \ + --src-tables=airline.ontime \ --dst-host=127.0.0.1 \ --csvpool \ --csvpool-file-path-prefix=qwe_ \ From d14e556299603b3771c9cf5a6d2ba09e7781a7a1 Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Mon, 24 Feb 2020 11:42:19 +0300 Subject: [PATCH 31/67] docs: add references --- docs/manual.md | 1247 ++++++++++++++++++++++++++++++++++++++ docs/usage-references.md | 19 + 2 files changed, 1266 insertions(+) create mode 100644 docs/manual.md create mode 100644 docs/usage-references.md diff --git a/docs/manual.md b/docs/manual.md new file mode 100644 index 0000000..1451bcf --- /dev/null +++ b/docs/manual.md @@ -0,0 +1,1247 @@ +# Table of Contents + + * [Introduction](#introduction) + * [Requirements and Installation](#requirements-and-installation) + * [Dev Installation](#dev-installation) + * [RPM Installation](#rpm-installation) + * [PyPi Installation](#pypi-installation) + * [GitHub-based Installation - Clone Sources](#github-based-installation---clone-sources) + * [MySQL setup](#mysql-setup) + * [Quick Start](#quick-start) + * [Operation](#operation) + * [Requirements and Limitations](#requirements-and-limitations) + * [Operation General Schema](#operation-general-schema) + * [Performance](#performance) + * [Examples](#examples) + * [Base Example](#base-example) + * [MySQL Migration Case 1 - with Tables Lock](#mysql-migration-case-1---with-tables-lock) + * [MySQL Migration Case 1 - Create ClickHouse Table](#mysql-migration-case-1---create-clickhouse-table) + * [MySQL Migration Case 1 - Migrate Existing Data](#mysql-migration-case-1---migrate-existing-data) + * [MySQL Migration Case 1 - Listen For New Data](#mysql-migration-case-1---listen-for-new-data) + * [MySQL Migration Case 2 - without Tables Lock](#mysql-migration-case-2---without-tables-lock) + * [MySQL Migration Case 2 - Create ClickHouse Table](#mysql-migration-case-2---create-clickhouse-table) + * [MySQL Migration Case 2 - Listen For New Data](#mysql-migration-case-2---listen-for-new-data) + * [MySQL Migration Case 2 - Migrate Existing Data](#mysql-migration-case-2---migrate-existing-data) + * [airline.ontime Test Case](#airlineontime-test-case) + * [airline.ontime Data Set in CSV files](#airlineontime-data-set-in-csv-files) + * [airline.ontime MySQL Table](#airlineontime-mysql-table) + * [airline.ontime ClickHouse Table](#airlineontime-clickhouse-table) + * [airline.ontime Data Reader](#airlineontime-data-reader) + * [airline.ontime Data Importer](#airlineontime-data-importer) + * [Testing](#testing) + * [Testing General Schema](#testing-general-schema) + * [MySQL Data Types](#mysql-data-types) + * [ClickHouse Data Types](#clickhouse-data-types) + * [MySQL -> ClickHouse Data Types Mapping](#mysql---clickhouse-data-types-mapping) + * [MySQL Test Tables](#mysql-test-tables) + * [ClickHouse Test Tables](#clickhouse-test-tables) + +--- + +# Introduction + +Utility to import data into ClickHouse from MySQL (mainly) and/or CSV files + +# Requirements and Installation + +Datareader requires at least **Python 3.4** with additional modules to be installed. +In most distributions Python 3 have `pip` utility named as `pip3`, so we'll use this naming. +However, you may have it called differently. + +Datareader can be installed either from `github` repo or from `pypi` repo. 
+ +## Dev Installation +```bash +sudo yum install -y rpm-build +sudo yum install -y epel-release +sudo yum install -y https://dev.mysql.com/get/mysql57-community-release-el7-11.noarch.rpm +curl -s https://packagecloud.io/install/repositories/altinity/clickhouse/script.rpm.sh | sudo bash + +sudo yum install -y python34-pip python34-devel python34-setuptools + +./package_rpm_distr.sh +./pack/build.sh +ls -l ./build/bdist.linux-x86_64/rpm/RPMS/noarch/ +sudo yum install ./build/bdist.linux-x86_64/rpm/RPMS/noarch/clickhouse-mysql-* +``` + +## RPM Installation +**Tested on CentOS 7** + +Packagecloud repo from [packagecloud.io](https://packagecloud.io/Altinity/clickhouse) +More details on installation are available on [https://github.com/Altinity/clickhouse-rpm-install](https://github.com/Altinity/clickhouse-rpm-install) +```bash +curl -s https://packagecloud.io/install/repositories/altinity/clickhouse/script.rpm.sh | sudo bash +``` +Install EPEL (for `python3`) and MySQL (for `libmysqlclient`) repos +```bash +sudo yum install -y epel-release +sudo yum install -y https://dev.mysql.com/get/mysql57-community-release-el7-11.noarch.rpm +``` + +If you do not have EPEL available in your repos, install it directly from EPEL site +```bash +sudo yum install -y https://download.fedoraproject.org/pub/epel/7/x86_64/Packages/e/epel-release-7-11.noarch.rpm +``` + +Install data reader from [packagecloud.io](https://packagecloud.io/Altinity/clickhouse) +```bash +sudo yum install -y clickhouse-mysql +``` +clickhouse packages would also be installed as dependencies. + +Prepare config file - copy **example** file into production and edit it. +```bash +sudo cp /etc/clickhouse-mysql/clickhouse-mysql-example.conf /etc/clickhouse-mysql/clickhouse-mysql.conf +sudo vim /etc/clickhouse-mysql/clickhouse-mysql.conf +``` + +Start service +```bash +sudo service clickhouse-mysql start +``` + +## PyPi Installation +In case you need just to use the app - this is the most convenient way to go. + +Install dependencies. +MySQL repo (for `mysql-community-devel`) +```bash +sudo yum install -y https://dev.mysql.com/get/mysql57-community-release-el7-11.noarch.rpm +``` +epel (for `python3`) +```bash +sudo yum install -y epel-release +``` + +clickhouse-client (for `clickhouse-client`) from Packagecloud repo from [packagecloud.io](https://packagecloud.io/Altinity/clickhouse) +More details on installation are available on [https://github.com/Altinity/clickhouse-rpm-install](https://github.com/Altinity/clickhouse-rpm-install) +```bash +curl -s https://packagecloud.io/install/repositories/altinity/clickhouse/script.rpm.sh | sudo bash +``` +```bash +sudo yum install -y clickhouse-client +``` + +and direct dependencies: +```bash +sudo yum install -y mysql-community-devel +sudo yum install -y mariadb-devel +sudo yum install -y gcc +sudo yum install -y python34-devel python34-pip +``` + +Install data reader +```bash +sudo pip3 install clickhouse-mysql +``` + +Now we are able to call datareader as an app - perform last installation steps - install service files, etc +```bash +[user@localhost ~]$ which clickhouse-mysql +/usr/bin/clickhouse-mysql +/usr/bin/clickhouse-mysql --install +``` + +## GitHub-based Installation - Clone Sources +In case you'd like to play around with the sources this is the way to go. 
+ +Install dependencies: + +`MySQLdb` package is used for communication with MySQL: +```bash +pip3 install mysqlclient +``` + +`mysql-replication` package is used for communication with MySQL also: +[https://github.com/noplay/python-mysql-replication](https://github.com/noplay/python-mysql-replication) +```bash +pip3 install mysql-replication +``` + +`clickhouse-driver` package is used for communication with ClickHouse: +[https://github.com/mymarilyn/clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) +```bash +pip3 install clickhouse-driver +``` + +Clone sources from github +```bash +git clone https://github.com/Altinity/clickhouse-mysql-data-reader +``` + +## MySQL setup + +Also the following (at least one of) MySQL privileges are required for this operation: `SUPER`, `REPLICATION CLIENT` + +```mysql +CREATE USER 'reader'@'%' IDENTIFIED BY 'qwerty'; +CREATE USER 'reader'@'127.0.0.1' IDENTIFIED BY 'qwerty'; +CREATE USER 'reader'@'localhost' IDENTIFIED BY 'qwerty'; +GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'%'; +GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'127.0.0.1'; +GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'localhost'; +FLUSH PRIVILEGES; +``` + +Also the following MySQL config options are required: +```ini +[mysqld] +# mandatory +server-id = 1 +log_bin = /var/lib/mysql/bin.log +binlog-format = row # very important if you want to receive write, update and delete row events +# optional +expire_logs_days = 30 +max_binlog_size = 768M +# setup listen address +bind-address = 0.0.0.0 +``` + +# Quick Start + +Suppose we have MySQL `airline.ontime` table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](clickhouse_mysql_examples/airline_ontime_schema_mysql.sql) and want to migrate it into ClickHouse. 
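+
+For orientation only — an abbreviated, hypothetical sketch of such a source table (not the full schema from the linked file) could look like this:
+```mysql
+CREATE TABLE airline.ontime (
+    `Year`       SMALLINT UNSIGNED,  -- maps to UInt16 (see the ClickHouse template later in this manual)
+    `Month`      TINYINT UNSIGNED,   -- maps to UInt8
+    `FlightDate` DATE                -- maps to Date
+    -- ...remaining columns omitted; see the linked schema file
+);
+```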
+ +Steps to do: + + * Setup MySQL access as described in [MySQL setup](#mysql-setup) + * Run data reader as following: + +```bash +clickhouse-mysql \ + --src-server-id=1 \ + --src-wait \ + --nice-pause=1 \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=qwerty \ + --src-tables=airline.ontime \ + --dst-host=127.0.0.1 \ + --dst-create-table \ + --migrate-table \ + --pump-data \ + --csvpool +``` + +Expected results are: + * automatically create target table in ClickHouse (if possible) + * migrate existing data from MySQL to ClickHouse + * after migration completed, listen for new events to come and pump data from MySQL into ClickHouse + +Options description + * `--src-server-id` - Master's server id + * `--src-wait` - wait for new data to come + * `--nice-pause=1` - when no data available sleep for 1 second + * `--src-host=127.0.0.1` - MySQL source host + * `--src-user=reader` - MySQL source user (remember about PRIVILEGES for this user) + * `--src-password=qwerty` - MySQL source password (remember about PRIVILEGES for this user) + * `--src-tables=airline.ontime` - list of MySQL source tables to process + * `--dst-host=127.0.0.1` - ClickHouse host + * `--dst-create-table` - create target table automatically + * `--migrate-table` - migrate source tables + * `--pump-data` - pump data from MySQL into ClickHouse after data migrated + * `--csvpool` - make pool of csv files while pumping data (assumes `--mempool` also) + +Choose any combination of `--pump-data`, `--migrate-table`, `--create-table-sql`, `--dst-create-table` + +# Operation + +## Requirements and Limitations + +Data reader understands INSERT SQL statements only. In practice this means that: + * You need to create required table in ClickHouse before starting data read procedure. More on how to create target ClickHouse table: [MySQL -> ClickHouse Data Types Mapping](#mysql---clickhouse-data-types-mapping) + * From all DML statements INSERT-only are handled, which means: + * UPDATE statements are not handled - meaning UPDATEs within MySQL would not be relayed into ClickHouse + * DELETE statements are not handled - meaning DELETEs within MySQL would not be relayed into ClickHouse + * DDL statements are not handled, which means: + * source table structure change (ALTER TABLE) has to be handled externally and can lead to insertion errors + +## Operation General Schema + + * Step 1. Data Reader reads data from the source event-by-event (for MySQL binlog) or line-by-line (file). + * Step 2. **OPTIONAL** Caching in memory pool. Since ClickHouse prefers to get data in bundles (row-by-row insertion is extremely slow), we need to introduce some caching. + Cache can be flushed by either of: + * number of rows in cache + * number of events in cache + * time elapsed + * data source depleted + * Step 3. **OPTIONAL** Writing CSV file. Sometimes it is useful to have data also represented as a file + * Step 4. Writing data into ClickHouse. Depending on the configuration of the previous steps data are written into ClickHouse by either of: + * directly event-by-event or line-by-line + * from memory cache as a bulk insert operation + * from CSV file via `clickhouse-client` + +## Performance + +`pypy` significantly improves performance. You should try it. Really. Up to **10 times performance boost** can be achieved. 
+For example you can start with [Portable PyPy distribution for Linux](https://github.com/squeaky-pl/portable-pypy#portable-pypy-distribution-for-linux) + - use [Python 3.x release](https://github.com/squeaky-pl/portable-pypy#latest-python-35-release) +Unpack it into your place of choice. + +```bash +[user@localhost ~]$ ls -l pypy3.5-5.9-beta-linux_x86_64-portable +total 32 +drwxr-xr-x 2 user user 140 Oct 24 01:14 bin +drwxr-xr-x 5 user user 4096 Oct 3 11:57 include +drwxr-xr-x 4 user user 4096 Oct 3 11:57 lib +drwxr-xr-x 13 user user 4096 Oct 3 11:56 lib_pypy +drwxr-xr-x 3 user user 15 Oct 3 11:56 lib-python +-rw-r--r-- 1 user user 11742 Oct 3 11:56 LICENSE +-rw-r--r-- 1 user user 1296 Oct 3 11:56 README.rst +drwxr-xr-x 14 user user 4096 Oct 24 01:16 site-packages +drwxr-xr-x 2 user user 195 Oct 3 11:57 virtualenv_support +``` + +Install `pip` +```bash +pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy -m ensurepip +``` +Install required modules +```bash +pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install mysql-replication +pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install clickhouse-driver +``` +`mysqlclient` may require to install `libmysqlclient-dev` and `gcc` +```bash +pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install mysqlclient +``` +Install them if need be +```bash +sudo apt-get install libmysqlclient-dev +``` +```bash +sudo apt-get install gcc +``` + +Now you can run data reader via `pypy` +```bash +/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy clickhouse-mysql +``` + +# Examples + +## Base Example + +Let's walk over test example of tool launch command line options. +This code snippet is taken from shell script (see more details in [airline.ontime Test Case](#airlineontime-test-case)) + +```bash +$PYTHON clickhouse-mysql ${*:1} \ + --src-server-id=1 \ + --src-resume \ + --src-wait \ + --nice-pause=1 \ + --log-level=info \ + --log-file=ontime.log \ + --src-host=127.0.0.1 \ + --src-user=root \ + --dst-host=127.0.0.1 \ + --csvpool \ + --csvpool-file-path-prefix=qwe_ \ + --mempool-max-flush-interval=60 \ + --mempool-max-events-num=1000 \ + --pump-data +``` +Options description + * `--src-server-id` - Master's server id + * `--src-resume` - resume data loading from the previous point. 
When the tool starts - resume from the end of the log + * `--src-wait` - wait for new data to come + * `--nice-pause=1` - when no data available sleep for 1 second + * `--log-level=info` - log verbosity + * `--log-file=ontime.log` - log file name + * `--src-host=127.0.0.1` - MySQL source host + * `--src-user=root` - MySQL source user (remember about PRIVILEGES for this user) + * `--dst-host=127.0.0.1` - ClickHouse host + * `--csvpool` - make pool of csv files (assumes `--mempool` also) + * `--csvpool-file-path-prefix=qwe_` - put these CSV files having `qwe_` prefix in `CWD` + * `--mempool-max-flush-interval=60` - flush mempool at least every 60 seconds + * `--mempool-max-events-num=1000` - flush mempool at least each 1000 events (not rows, but events) + * `--pump-data` - pump data from MySQL into ClickHouse + +## MySQL Migration Case 1 - with Tables Lock + +Suppose we have MySQL `airline.ontime` table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](clickhouse_mysql_examples/airline_ontime_schema_mysql.sql) with multiple rows: + +```mysql +mysql> SELECT COUNT(*) FROM airline.ontime; ++----------+ +| count(*) | ++----------+ +| 7694964 | ++----------+ +``` + +MySQL is already configured as [described earlier](#mysql-setup). +Let's migrate existing data to ClickHouse and listen for newly coming data in order to migrate them to CLickHouse on-the-fly. + +### MySQL Migration Case 1 - Create ClickHouse Table + +Create ClickHouse table description +```bash +clickhouse-mysql \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=Qwerty1# \ + --create-table-sql-template \ + --with-create-database \ + --src-tables=airline.ontime > create_clickhouse_table_template.sql +``` +We have **CREATE TABLE** template stored in `create_clickhouse_table_template.sql` file. +```bash +vim create_clickhouse.sql +``` +Setup sharding field and primary key. These columns must not be `Nullable` +```bash mysql +...cut... + `Year` UInt16, +...cut... + `FlightDate` Date, +...cut... + `Month` UInt8, +...cut... +) ENGINE = MergeTree(FlightDate, (FlightDate, Year, Month), 8192) +``` + +Create table in ClickHouse +```bash +clickhouse-client -mn < create_clickhouse_table_template.sql +``` + +### MySQL Migration Case 1 - Migrate Existing Data + +Lock MySQL in order to avoid new data coming while data migration is running. Keep `mysql` client open during the whole process +```mysql +mysql> FLUSH TABLES WITH READ LOCK; +``` + +Migrate data +```bash +clickhouse-mysql \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=Qwerty1# \ + --migrate-table \ + --src-tables=airline.ontime \ + --dst-host=127.0.0.1 +``` +This may take some time. +Check all data is in ClickHouse +```mysql +:) select count(*) from airline.ontime; + +SELECT count(*) +FROM airline.ontime + +┌─count()─┐ +│ 7694964 │ +└─────────┘ +``` + +### MySQL Migration Case 1 - Listen For New Data + +Start `clickhouse-mysql` as a replication slave, so it will listen for new data coming: +```bash +clickhouse-mysql \ + --src-server-id=1 \ + --src-resume \ + --src-wait \ + --nice-pause=1 \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=Qwerty1# \ + --src-tables=airline.ontime \ + --dst-host=127.0.0.1 \ + --csvpool \ + --csvpool-file-path-prefix=qwe_ \ + --mempool-max-flush-interval=60 \ + --mempool-max-events-num=10000 \ + --pump-data +``` + +Allow new data to be inserted into MySQL - i.e. unlock tables. + +```mysql +mysql> UNLOCK TABLES; +``` + +Insert some data into MySQL. 
For example, via [clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh](clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh) script + +```mysql +mysql> SELECT COUNT(*) FROM airline.ontime; ++----------+ +| count(*) | ++----------+ +| 10259952 | ++----------+ +``` + +Replication will be pumping data from MySQL into ClickHouse in background and in some time we'll see the following picture in ClickHouse: +```mysql +:) select count(*) from airline.ontime; + +SELECT count(*) +FROM airline.ontime + +┌──count()─┐ +│ 10259952 │ +└──────────┘ +``` + +## MySQL Migration Case 2 - without Tables Lock +Suppose we'd like to migrate multiple log tables of the same structure named as `log_XXX` - i.e. all of them have `log_` name prefix +into one ClickHouse table named `logunified` of the following structure +```sql +DESCRIBE TABLE logunified + +┌─name─┬─type───┬─default_type─┬─default_expression─┐ +│ id │ UInt64 │ │ │ +│ day │ Date │ │ │ +│ str │ String │ │ │ +└──────┴────────┴──────────────┴────────────────────┘ +``` +Log tables by nature are `INSERT`-only tables. Let's migrate these tables. + +### MySQL Migration Case 2 - Create ClickHouse Table +Prepare tables templates in `create_clickhouse.sql` file +```bash +clickhouse-mysql \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=qwerty \ + --create-table-sql-template \ + --with-create-database \ + --src-tables-prefixes=db.log_ > create_clickhouse_table_template.sql +``` +Edit templates +```bash +vim create_clickhouse_table_template.sql +``` +And create tables in ClickHouse +```bash + +clickhouse-client -mn < create_clickhouse_table_template.sql +``` + +### MySQL Migration Case 2 - Listen For New Data +```bash +clickhouse-mysql \ + --src-server-id=1 \ + --src-resume \ + --src-wait \ + --nice-pause=1 \ + --log-level=info \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=qwerty \ + --src-tables-prefixes=log_ \ + --dst-host=127.0.0.1 \ + --dst-table=logunified \ + --csvpool \ + --pump-data +``` +Pay attention to +```bash + --src-tables-prefixes=log_ \ + --dst-table=logunified \ +``` +Replication data from multiple tables into one destination table `--dst-table=logunified`. + +Monitor logs for `first row in replication` notification of the following structure: +```bash +INFO:first row in replication db.log_201801_2 +column: id=1727834 +column: day=2018-01-20 +column: str=data event 3 +``` +These records help us to create SQL statement for Data Migration process. +Sure, we can peek into MySQL database manually in order to understand what records would be the last to be copied by migration process. 
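+
+For example, a manual check of that boundary (assuming the auto-increment `id` column shown above) might look like this:
+```mysql
+mysql> SELECT MAX(id) FROM db.log_201801_1;
+```
+The returned value can then be compared with the `first row in replication` id when composing the where-clause files used in the next step.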
+ +### MySQL Migration Case 2 - Migrate Existing Data + +```bash +clickhouse-mysql \ + --src-host=127.0.0.1 \ + --src-user=reader \ + --src-password=qwerty \ + --migrate-table \ + --src-tables-prefixes=db.log_ \ + --src-tables-where-clauses=db.log_201801_1=db.log_201801_1.sql,db.log_201801_2=db.log_201801_2.sql,db.log_201801_3=db.log_201801_3.sql \ + --dst-host=127.0.0.1 \ + --dst-table=logunified \ + --csvpool +``` + +Pay attention to +```bash + --src-tables-prefixes=db.log_ \ + --src-tables-where-clauses=db.log_201801_1=db.log_201801_1.sql,db.log_201801_2=db.log_201801_2.sql,db.log_201801_3=db.log_201801_3.sql \ + --dst-table=logunified \ +``` +Migration subset of data described in `--src-tables-where-clauses` files from multiple tables into one destination table `--dst-table=logunified` + +Values for where clause in `db.log_201801_1.sql` are fetched from `first row in replication` log: `INFO:first row in replication db.log_201801_1` +```bash +cat db.log_201801_1.sql +id < 1727831 +``` + +Result: +```sql +:) select count(*) from logunified; + +SELECT count(*) +FROM logunified + +┌──count()─┐ +│ 12915568 │ +└──────────┘ + +``` + +## airline.ontime Test Case + +Main Steps + * Download airline.ontime dataset + * Create airline.ontime MySQL table + * Create airline.ontime ClickHouse table + * Start data reader (utility to migrate data MySQL -> ClickHouse) + * Start data importer (utility to import data into MySQL) + * Check how data are loaded into ClickHouse + +### airline.ontime Data Set in CSV files +Run [download script](clickhouse_mysql_examples/airline_ontime_data_download.sh) + +You may want to adjust dirs where to keep `ZIP` and `CSV` file + +In `airline_ontime_data_download.sh` edit these lines: +```bash +... +ZIP_FILES_DIR="zip" +CSV_FILES_DIR="csv" +... +``` +You may want to adjust number of files to download (In case downloading all it may take some time). + +Specify year and months range as you wish: +```bash +... +echo "Download files into $ZIP_FILES_DIR" +for year in `seq 1987 2017`; do + for month in `seq 1 12`; do +... +``` + +```bash +./airline_ontime_data_download.sh +``` +Downloading can take some time. + +### airline.ontime MySQL Table +Create MySQL table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](clickhouse_mysql_examples/airline_ontime_schema_mysql.sql): +```bash +mysql -uroot -p < clickhouse_mysql_examples/airline_ontime_schema_mysql.sql +``` + +### airline.ontime ClickHouse Table +Create ClickHouse table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_ch.sql](clickhouse_mysql_examples/airline_ontime_schema_ch.sql): +```bash +clickhouse-client -mn < clickhouse_mysql_examples/airline_ontime_schema_ch.sql +``` + +### airline.ontime Data Reader +Run [datareader script](clickhouse_mysql_examples/airline_ontime_data_mysql_to_ch_reader.sh) + +You may want to adjust `PYTHON` path and source and target hosts and usernames +```bash +... +PYTHON=python3.6 +PYTHON=/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy +... +``` +```bash +... + --src-host=127.0.0.1 \ + --src-user=root \ + --dst-host=127.0.0.1 \ +... +``` +```bash +./airline_ontime_data_mysql_to_ch_reader.sh +``` + +### airline.ontime Data Importer +Run [data importer script](clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh) + +You may want to adjust `CSV` files location, number of imported files and MySQL user/password used for import +```bash +... 
+# looking for csv files in this dir +FILES_TO_IMPORT_DIR="/mnt/nas/work/ontime" + +# limit import to this number of files +FILES_TO_IMPORT_NUM=3 +... +``` +```bash +... + -u root \ +... +``` + +```bash +./airline_ontime_mysql_data_import.sh +``` + +# Testing + +## Testing General Schema + +### MySQL Data Types + +#### Numeric Types + + * `BIT` the number of bits per value, from 1 to 64 + * `TINYINT` -128 to 127. The unsigned range is 0 to 255 + * `BOOL`, `BOOLEAN` synonyms for `TINYINT(1)` + * `SMALLINT` -32768 to 32767. The unsigned range is 0 to 65535 + * `MEDIUMINT` -8388608 to 8388607. The unsigned range is 0 to 16777215. + * `INT`, `INTEGER` -2147483648 to 2147483647. The unsigned range is 0 to 4294967295 + * `BIGINT` -9223372036854775808 to 9223372036854775807. The unsigned range is 0 to 18446744073709551615 + + * `SERIAL` is an alias for `BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE`. + * `DEC`, `DECIMAL`, `FIXED`, `NUMERIC` A packed ?exact? fixed-point number + * `FLOAT` Permissible values are -3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38 + * `DOUBLE`, `REAL` Permissible values are -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308 + + +#### Date and Time Types + + * `DATE` The supported range is '1000-01-01' to '9999-12-31' + * `DATETIME` The supported range is '1000-01-01 00:00:00.000000' to '9999-12-31 23:59:59.999999' + * `TIMESTAMP` The range is '1970-01-01 00:00:01.000000' UTC to '2038-01-19 03:14:07.999999' + * `TIME` The range is '-838:59:59.000000' to '838:59:59.000000' + * `YEAR` Values display as 1901 to 2155, and 0000 + +#### String Types + * `CHAR` The range of M is 0 to 255. If M is omitted, the length is 1. + * `VARCHAR` The range of M is 0 to 65,535 + * `BINARY` similar to CHAR + * `VARBINARY` similar to VARCHAR + * `TINYBLOB` maximum length of 255 + * `TINYTEXT` maximum length of 255 + * `BLOB` maximum length of 65,535 + * `TEXT` maximum length of 65,535 + * `MEDIUMBLOB` maximum length of 16,777,215 + * `MEDIUMTEXT` maximum length of 16,777,215 + * `LONGBLOB` maximum length of 4,294,967,295 or 4GB + * `LONGTEXT` maximum length of 4,294,967,295 or 4GB + * `ENUM` can have a maximum of 65,535 distinct elements + * `SET` can have a maximum of 64 distinct members + + * `JSON` native JSON data type defined by RFC 7159 + +--- + +### ClickHouse Data Types + + * `Date` number of days since 1970-01-01 + * `DateTime` Unix timestamp + * `Enum8` or `Enum16`. A set of enumerated string values that are stored as `Int8` or `Int16`. The numeric values must be within -128..127 for Enum8 and -32768..32767 for Enum16 + * `Float32`, `Float64` + + * `Int8` -128 127 + * `UInt8` 0 255 + + * `Int16` -32768 32767 + * `UInt16` 0 65535 + + * `Int32` -2147483648 2147483647 + * `UInt32` 0 4294967295 + + * `Int64` -9223372036854775808 9223372036854775807 + * `UInt64` 0 18446744073709551615 + + * `FixedString(N)` string of `N` bytes (not characters or code points) + * `String` The length is not limited. The value can contain an arbitrary set of bytes, including null bytes + +--- + +### MySQL -> ClickHouse Data Types Mapping + +#### Numeric Types + + * `BIT` -> ??? (possibly `String`?) + * `TINYINT` -> `Int8`, `UInt8` + * `BOOL`, `BOOLEAN` -> `UInt8` + * `SMALLINT` -> `Int16`, `UInt16` + * `MEDIUMINT` -> `Int32`, `UInt32` + * `INT`, `INTEGER` -> `Int32`, `UInt32` + * `BIGINT` -> `Int64`, `UInt64` + + * `SERIAL` -> `UInt64` + * `DEC`, `DECIMAL`, `FIXED`, `NUMERIC` -> ???? (possibly `String`?) 
+ * `FLOAT` -> `Float32` + * `DOUBLE`, `REAL` -> `Float64` + + +#### Date and Time Types + + * `DATE` -> `Date` (for valid values) or `String` + `Date` Allows storing values from just after the beginning of the Unix Epoch + to the upper threshold defined by a constant at the compilation stage + (currently, this is until the year 2038, but it may be expanded to 2106) + * `DATETIME` -> `DateTime` (for valid values) or `String` + * `TIMESTAMP` -> `DateTime` + * `TIME` -> ????? (possibly `String`?) + * `YEAR` -> `UInt16` + + +#### String Types + + * `CHAR` -> `FixedString` + * `VARCHAR` -> `String` + * `BINARY` -> `String` + * `VARBINARY` -> `String` + * `TINYBLOB` -> `String` + * `TINYTEXT` -> `String` + * `BLOB` -> `String` + * `TEXT` -> `String` + * `MEDIUMBLOB` -> `String` + * `MEDIUMTEXT` -> `String` + * `LONGBLOB` -> `String` + * `LONGTEXT` -> `String` + +#### Set Types + * `ENUM` -> `Enum8`, `Enum16` + * `SET` -> `Array(Int8)` + +#### Custom Types + * `JSON` -> ?????? (possibly `String`?) + + +### MySQL Test Tables + +We have to separate test table into several ones because of this error, produced by MySQL: +```text +ERROR 1118 (42000): Row size too large. The maximum row size for the used table type, not counting BLOBs, is 65535. This includes storage overhead, check the manual. You have to change some columns to TEXT or BLOBs +``` + +```mysql +CREATE TABLE datatypes( + + bit_1 BIT(1), + bit_2 BIT(64), + + tinyint_1 TINYINT COMMENT '-128 to 127', + u_tinyint_1 TINYINT UNSIGNED COMMENT '0 to 255', + + bool_1 BOOL, + bool_2 BOOLEAN, + + smallint_1 SMALLINT COMMENT '-32768 to 32767', + u_smallint_1 SMALLINT UNSIGNED COMMENT '0 to 65535', + + mediumint_1 MEDIUMINT COMMENT '-8388608 to 8388607', + u_mediumint_1 MEDIUMINT UNSIGNED COMMENT '0 to 16777215', + + int_1 INT COMMENT '-2147483648 to 2147483647', + u_int_1 INT UNSIGNED COMMENT '0 to 4294967295', + + integer_1 INTEGER COMMENT '-2147483648 to 2147483647', + u_integer_1 INTEGER UNSIGNED COMMENT '0 to 4294967295', + + bigint_1 BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', + u_bigint_1 BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', + + serial_1 SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 0 to 18446744073709551615', + + decimal_1 DECIMAL(3,2) COMMENT 'exact fixed-point number', + dec_1 DEC(3,2) COMMENT 'alias for DECIMAL', + fixed_1 FIXED(3,2) COMMENT 'alias for DECIMAL', + numeric_1 NUMERIC(3,2) COMMENT 'alias for DECIMAL', + + float_1 FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', + u_float_1 FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', + + double_1 DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_double_1 DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + real_1 REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_real_1 REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 
0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + date_1 DATE COMMENT '1000-01-01 to 9999-12-31', + datetime_1 DATETIME COMMENT '1000-01-01 00:00:00 to 9999-12-31 23:59:59', + timestamp_1 TIMESTAMP COMMENT '1970-01-01 00:00:01 UTC to 2038-01-19 03:14:07 UTC', + time_1 TIME COMMENT '-838:59:59 to 838:59:59', + year_1 YEAR COMMENT '1901 to 2155, and 0000', + + char_0 CHAR(0), + char_1 CHAR(1), + char_2 CHAR(255), + + varchar_0 VARCHAR(0), + varchar_1 VARCHAR(1), + + binary_0 BINARY(0) COMMENT 'similar to CHAR', + binary_1 BINARY(1) COMMENT 'similar to CHAR', + binary_2 BINARY(255) COMMENT 'similar to CHAR', + + varbinary_0 VARBINARY(0) COMMENT 'similar to VARCHAR', + varbinary_1 VARBINARY(1) COMMENT 'similar to VARCHAR', + + tinyblob_1 TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', + tinytext_1 TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', + + blob_1 BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', + text_1 TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', + + mediumblob_1 MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', + mediumtext_1 MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', + + longblob_1 LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', + longtext_1 LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) characters' +) +; + +CREATE TABLE enum_datatypes( + enum_1 ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements' +) +; + +CREATE TABLE set_datatypes( + set_1 SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT ' can have a maximum of 64 distinct members' +) +; + +CREATE TABLE json_datatypes( + json_1 JSON +) +; + +CREATE TABLE long_varchar_datatypes( + varchar_2 VARCHAR(65532) +) +; + +CREATE TABLE long_varbinary_datatypes( + varbinary_2 VARBINARY(65532) COMMENT 'similar to VARCHAR' +) +; +``` + + +```mysql +-- in order to be able to set timestamp = '1970-01-01 00:00:01' +set time_zone='+00:00'; +``` + +Insert minimal acceptable values into the test table: + +```mysql +-- MIN values +INSERT INTO datatypes SET + + bit_1 = 0b0, -- BIT(1), + bit_2 = 0b0, -- BIT(64), + + tinyint_1 = -128, -- TINYINT COMMENT '-128 to 127', + u_tinyint_1 = 0, -- TINYINT UNSIGNED COMMENT '0 to 255', + + bool_1 = FALSE, -- BOOL, + bool_2 = FALSE, -- BOOLEAN, + + smallint_1 = -32768, -- SMALLINT COMMENT '-32768 to 32767', + u_smallint_1 = 0, -- SMALLINT UNSIGNED COMMENT '0 to 65535', + + mediumint_1 = -8388608, -- MEDIUMINT COMMENT '-8388608 to 8388607', + u_mediumint_1 = 0, -- MEDIUMINT UNSIGNED COMMENT '0 to 16777215', + + int_1 = -2147483648, -- INT COMMENT '-2147483648 to 2147483647', + u_int_1 = 0, -- INT UNSIGNED COMMENT '0 to 4294967295', + + integer_1 = -2147483648, -- INTEGER COMMENT '-2147483648 to 2147483647', + u_integer_1 = 0, -- INTEGER UNSIGNED COMMENT '0 to 4294967295', + + bigint_1 = -9223372036854775808, -- BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', + u_bigint_1 = 0, -- BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', + + serial_1 = 0, -- SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 
0 to 18446744073709551615', + + decimal_1 = -9.99, -- DECIMAL(3,2) COMMENT 'exact fixed-point number', + dec_1 = -9.99, -- DEC(3,2) COMMENT 'alias for DECIMAL', + fixed_1 = -9.99, -- FIXED(3,2) COMMENT 'alias for DECIMAL', + numeric_1 = -9.99, -- NUMERIC(3,2) COMMENT 'alias for DECIMAL', + + float_1 = -3.402823466E+38, -- FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', + u_float_1 = 0, -- FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', + + double_1 = -1.7976931348623157E+308, -- DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_double_1 = 0, -- DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + real_1 = -1.7976931348623157E+308, -- REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_real_1 = 0, -- REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + date_1 = '1970-01-01', -- DATE COMMENT '1000-01-01 to 9999-12-31', + datetime_1 = '1970-01-01 00:00:00', -- DATETIME COMMENT '1000-01-01 00:00:00 to 9999-12-31 23:59:59', + timestamp_1 = '1970-01-01 00:00:01', -- TIMESTAMP COMMENT '1970-01-01 00:00:01 UTC to 2038-01-19 03:14:07 UTC', + time_1 = '-838:59:59', -- TIME COMMENT '-838:59:59 to 838:59:59', + year_1 = 1901, -- YEAR COMMENT '1901 to 2155, and 0000', + + char_0 = '', -- CHAR(0), + char_1 = '', -- CHAR(1), + char_2 = '', -- CHAR(255), + + varchar_0 = '', -- VARCHAR(0), + varchar_1 = '', -- VARCHAR(1), + + binary_0 = '', -- BINARY(0) COMMENT 'similar to CHAR', + binary_1 = '', -- BINARY(1) COMMENT 'similar to CHAR', + binary_2 = '', -- BINARY(255) COMMENT 'similar to CHAR', + + varbinary_0 = '', -- VARBINARY(0) COMMENT 'similar to VARCHAR', + varbinary_1 = '', -- VARBINARY(1) COMMENT 'similar to VARCHAR', + + tinyblob_1 = '', -- TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', + tinytext_1 = '', -- TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', + + blob_1 = '', -- BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', + text_1 = '', -- TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', + + mediumblob_1 = '', -- MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', + mediumtext_1 = '', -- MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', + + longblob_1 = '', -- LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', + longtext_1 = '' -- LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 
1) characters' +; + +INSERT INTO enum_datatypes SET + enum_1 = NULL -- ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements' +; + +INSERT INTO set_datatypes SET + set_1 = '' -- SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 64 distinct members' +; + +INSERT INTO json_datatypes SET + json_1 = '{}' -- JSON +; + +INSERT INTO long_varchar_datatypes SET + varchar_2 = "" +; + +INSERT INTO long_varbinary_datatypes SET + varbinary_2 = "" +; +``` + +Insert maximum acceptable values into the test table: + +```mysql +-- MAX values +INSERT INTO datatypes SET + + bit_1 = 0b1, -- BIT(1), + bit_2 = 0b1111111111111111111111111111111111111111111111111111111111111111, -- BIT(64), + + tinyint_1 = 127, -- TINYINT COMMENT '-128 to 127', + u_tinyint_1 = 255, -- TINYINT UNSIGNED COMMENT '0 to 255', + + bool_1 = TRUE, -- BOOL, + bool_2 = TRUE, -- BOOLEAN, + + smallint_1 = 32767, -- SMALLINT COMMENT '-32768 to 32767', + u_smallint_1 = 65535, -- SMALLINT UNSIGNED COMMENT '0 to 65535', + + mediumint_1 = 8388607, -- MEDIUMINT COMMENT '-8388608 to 8388607', + u_mediumint_1 = 16777215, -- MEDIUMINT UNSIGNED COMMENT '0 to 16777215', + + int_1 = 2147483647, -- INT COMMENT '-2147483648 to 2147483647', + u_int_1 = 4294967295, -- INT UNSIGNED COMMENT '0 to 4294967295', + + integer_1 = 2147483647, -- INTEGER COMMENT '-2147483648 to 2147483647', + u_integer_1 = 4294967295, -- INTEGER UNSIGNED COMMENT '0 to 4294967295', + + bigint_1 = 9223372036854775807, -- BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', + u_bigint_1 = 18446744073709551615, -- BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', + + serial_1 = 18446744073709551615, -- SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 0 to 18446744073709551615', + + decimal_1 = 9.99, -- DECIMAL(3,2) COMMENT 'exact fixed-point number', + dec_1 = 9.99, -- DEC(3,2) COMMENT 'alias for DECIMAL', + fixed_1 = 9.99, -- FIXED(3,2) COMMENT 'alias for DECIMAL', + numeric_1 = 9.99, -- NUMERIC(3,2) COMMENT 'alias for DECIMAL', + + float_1 = 3.402823466E+38, -- FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', + u_float_1 = 3.402823466E+38, -- FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', + + double_1 = 1.7976931348623157E+308, -- DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_double_1 = 1.7976931348623157E+308, -- DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + real_1 = 1.7976931348623157E+308, -- REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_real_1 = 1.7976931348623157E+308, -- REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 
0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + date_1 = '2149-06-01', -- DATE COMMENT '1000-01-01 to 9999-12-31', + datetime_1 = '2106-02-01 23:59:59', -- DATETIME COMMENT '1000-01-01 00:00:00 to 9999-12-31 23:59:59', + timestamp_1 = '2038-01-19 03:14:07', -- TIMESTAMP COMMENT '1970-01-01 00:00:01 UTC to 2038-01-19 03:14:07 UTC', + time_1 = '838:59:59', -- TIME COMMENT '-838:59:59 to 838:59:59', + year_1 = 2155, -- YEAR COMMENT '1901 to 2155, and 0000', + + char_0 = '', -- CHAR(0), + char_1 = 'a', -- CHAR(1), + char_2 = 'abcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcde', -- CHAR(255), + + varchar_0 = '', -- VARCHAR(0), + varchar_1 = 'a', -- VARCHAR(1), + + binary_0 = '', -- BINARY(0) COMMENT 'similar to CHAR', + binary_1 = 'a', -- BINARY(1) COMMENT 'similar to CHAR', + binary_2 = 'abcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcde', -- BINARY(255) COMMENT 'similar to CHAR', + + varbinary_0 = '', -- VARBINARY(0) COMMENT 'similar to VARCHAR', + varbinary_1 = 'a', -- VARBINARY(1) COMMENT 'similar to VARCHAR', + + tinyblob_1 = 'a', -- TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', + tinytext_1 = 'a', -- TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', + + blob_1 = 'a', -- BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', + text_1 = 'a', -- TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', + + mediumblob_1 = 'a', -- MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', + mediumtext_1 = 'a', -- MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', + + longblob_1 = 'a', -- LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', + longtext_1 = 'a' -- LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 
1) characters' +; + +INSERT INTO enum_datatypes SET + enum_1 = 'a' -- ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements' +; + +INSERT INTO set_datatypes SET + set_1 = 'a,b,c' -- SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 64 distinct members', +; + +INSERT INTO json_datatypes SET + json_1 = '{"a":1, "b":2, "c":3}' -- JSON +; + +INSERT INTO long_varchar_datatypes SET + varchar_2 = "abc" +; + +INSERT INTO long_varbinary_datatypes SET + varbinary_2 = "abc" +; +``` + +### ClickHouse Test Tables + +```sql +CREATE TABLE datatypes( + bit_1 Nullable(String), -- bit_1 BIT(1), + bit_2 Nullable(String), -- bit_2 BIT(64), + + tinyint_1 Nullable(Int8), -- tinyint_1 TINYINT COMMENT '-128 to 127', + u_tinyint_1 Nullable(UInt8), -- u_tinyint_1 TINYINT UNSIGNED COMMENT '0 to 255', + + bool_1 Nullable(UInt8), -- bool_1 BOOL, + bool_2 Nullable(UInt8), -- bool_2 BOOLEAN, + + smallint_1 Nullable(Int16), -- smallint_1 SMALLINT COMMENT '-32768 to 32767', + u_smallint_1 Nullable(UInt16), -- u_smallint_1 SMALLINT UNSIGNED COMMENT '0 to 65535', + + mediumint_1 Nullable(Int32), -- mediumint_1 MEDIUMINT COMMENT '-8388608 to 8388607', + u_mediumint_1 Nullable(UInt32), -- u_mediumint_1 MEDIUMINT UNSIGNED COMMENT '0 to 16777215', + + int_1 Nullable(Int32), -- int_1 INT COMMENT '-2147483648 to 2147483647', + u_int_1 Nullable(UInt32), -- u_int_1 INT UNSIGNED COMMENT '0 to 4294967295', + + integer_1 Nullable(Int32), -- integer_1 INTEGER COMMENT '-2147483648 to 2147483647', + u_integer_1 Nullable(UInt32), -- u_integer_1 INTEGER UNSIGNED COMMENT '0 to 4294967295', + + bigint_1 Nullable(Int64), -- bigint_1 BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', + u_bigint_1 Nullable(UInt64), -- u_bigint_1 BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', + + serial_1 Nullable(UInt64), -- serial_1 SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 0 to 18446744073709551615', + + decimal_1 Nullable(String), -- decimal_1 DECIMAL(3,2) COMMENT 'exact fixed-point number', + dec_1 Nullable(String), -- dec_1 DEC(3,2) COMMENT 'alias for DECIMAL', + fixed_1 Nullable(String), -- fixed_1 FIXED(3,2) COMMENT 'alias for DECIMAL', + numeric_1 Nullable(String), -- numeric_1 NUMERIC(3,2) COMMENT 'alias for DECIMAL', + + float_1 Nullable(Float32), -- float_1 FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', + u_float_1 Nullable(Float32), -- u_float_1 FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', + + double_1 Nullable(Float64), -- double_1 DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_double_1 Nullable(Float64), -- u_double_1 DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + real_1 Nullable(Float64), -- real_1 REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + u_real_1 Nullable(Float64), -- u_real_1 REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 
0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', + + date_1 Nullable(Date), -- date_1 DATE COMMENT '1000-01-01 to 9999-12-31', + datetime_1 Nullable(DateTime), -- datetime_1 DATETIME COMMENT '1000-01-01 00:00:00.000000 to 9999-12-31 23:59:59.999999', + timestamp_1 Nullable(DateTime), -- timestamp_1 TIMESTAMP COMMENT '1970-01-01 00:00:01.000000 UTC to 2038-01-19 03:14:07.999999 UTC', + time_1 Nullable(String), -- time_1 TIME COMMENT '-838:59:59.000000 to 838:59:59.000000', + year_1 Nullable(UInt16), -- year_1 YEAR COMMENT '1901 to 2155, and 0000', + + char_0 Nullable(FixedString(1)), -- char_0 CHAR(0), + char_1 Nullable(FixedString(1)), -- char_1 CHAR(1), + char_2 Nullable(FixedString(255)), -- char_2 CHAR(255), + + varchar_0 Nullable(String), -- varchar_0 VARCHAR(0), + varchar_1 Nullable(String), -- varchar_1 VARCHAR(1), + + binary_0 Nullable(String), -- binary_0 BINARY(0) COMMENT 'similar to CHAR', + binary_1 Nullable(String), -- binary_1 BINARY(1) COMMENT 'similar to CHAR', + binary_2 Nullable(String), -- binary_2 BINARY(255) COMMENT 'similar to CHAR', + + varbinary_0 Nullable(String), -- varbinary_0 VARBINARY(0) COMMENT 'similar to VARCHAR', + varbinary_1 Nullable(String), -- varbinary_1 VARBINARY(1) COMMENT 'similar to VARCHAR', + + tinyblob_1 Nullable(String), -- tinyblob_1 TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', + tinytext_1 Nullable(String), -- tinytext_1 TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', + + blob_1 Nullable(String), -- blob_1 BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', + text_1 Nullable(String), -- text_1 TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', + + mediumblob_1 Nullable(String), -- mediumblob_1 MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', + mediumtext_1 Nullable(String), -- mediumtext_1 MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', + + longblob_1 Nullable(String), -- longblob_1 LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', + longtext_1 Nullable(String) -- longtext_1 LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 
1) characters', + +) ENGINE = Log +; + +CREATE TABLE enum_datatypes( + enum_1 Enum16('a'=1, 'b'=2, 'c'=3, 'd'=4, 'e'=5, 'f'=6) -- enum_1 ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements', +) ENGINE = Memory +; + +CREATE TABLE set_datatypes( + set_1 Array(Enum16('a'=1, 'b'=2, 'c'=3, 'd'=4, 'e'=5, 'f'=6)) -- set_1 SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT ' can have a maximum of 64 distinct members', +) ENGINE = Memory +; + +CREATE TABLE set_datatypes( + set_1 String -- set_1 SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT ' can have a maximum of 64 distinct members', +) ENGINE = Memory +; + + +CREATE TABLE json_datatypes( + json_1 String -- json_1 JSON +) ENGINE = Memory +; + +CREATE TABLE long_varchar_datatypes( + varchar_2 String +) ENGINE = Memory +; + +CREATE TABLE long_varbinary_datatypes( + varbinary_2 String +) ENGINE = Memory +; +``` diff --git a/docs/usage-references.md b/docs/usage-references.md new file mode 100644 index 0000000..195c75b --- /dev/null +++ b/docs/usage-references.md @@ -0,0 +1,19 @@ +## Use Cases +- [3 Step Migration of MySQL data to Clickhouse for faster analytics.](https://mydbops.wordpress.com/2020/02/21/3-step-migration-of-mysql-data-to-clickhouse-for-faster-analytics/) +- [Hybrid OLTP/Analytics Database Workloads: Replicating MySQL Data to ClickHouse](https://severalnines.com/database-blog/hybrid-oltpanalytics-database-workloads-replicating-mysql-data-clickhouse) +- [How to import and replicate data from MySQL toClickHouse](https://minervadb.com/wp-content/uploads/2019/10/How-to-import-and-replicate-data-from-MySQL-to-ClickHouse.pdf) +- [Use Yandex ClickHouse for Analytics with Data from MySQL](https://www.tienle.com/2018/05-04/use-yandex-clickhouse-for-analytics-with-data-from-mysql.html) + +## Talks +- [Opensource Column Store Databases: MariaDB ColumnStore vs. 
ClickHouse](https://www.percona.com/live/19/sites/default/files/slides/Opensource%20Column%20Store%20Databases_%20MariaDB%20ColumnStore%20vs.%20ClickHouse%20-%20FileId%20-%20188040.pdf) +- [Replicating MySQL Data to TiDB For Near Real-Time Analytics](https://dataops.barcelona/wp-content/uploads/2019/06/Replicating-to-TiDb-francisco-Bordenave.pdf) + +## TODOs and HOWTOs +- [Clickhouse install and use /clickhouse-mysql installation](http://www.programmersought.com/article/7079240138/) +- [Replication from MySQL to ClickHouse](https://www.goplardb.com/post/replication-from-mysql-to-clickhouse) + + +## Other References +- [CH integrations](https://clickhouse.tech/docs/en/interfaces/third-party/integrations/) +- [awesomeopensource](https://awesomeopensource.com/projects/clickhouse) + From b4ec875445628060e6ec4d0a7dafa7d769f20bb3 Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Mon, 24 Feb 2020 11:42:42 +0300 Subject: [PATCH 32/67] docs: move manual to docs and add references section --- README.md | 1251 +---------------------------------------------------- 1 file changed, 2 insertions(+), 1249 deletions(-) diff --git a/README.md b/README.md index 44e92a2..123796a 100644 --- a/README.md +++ b/README.md @@ -1,1251 +1,4 @@ # clickhouse-mysql-data-reader ---- - -# Table of Contents - - * [Introduction](#introduction) - * [Requirements and Installation](#requirements-and-installation) - * [Dev Installation](#dev-installation) - * [RPM Installation](#rpm-installation) - * [PyPi Installation](#pypi-installation) - * [GitHub-based Installation - Clone Sources](#github-based-installation---clone-sources) - * [MySQL setup](#mysql-setup) - * [Quick Start](#quick-start) - * [Operation](#operation) - * [Requirements and Limitations](#requirements-and-limitations) - * [Operation General Schema](#operation-general-schema) - * [Performance](#performance) - * [Examples](#examples) - * [Base Example](#base-example) - * [MySQL Migration Case 1 - with Tables Lock](#mysql-migration-case-1---with-tables-lock) - * [MySQL Migration Case 1 - Create ClickHouse Table](#mysql-migration-case-1---create-clickhouse-table) - * [MySQL Migration Case 1 - Migrate Existing Data](#mysql-migration-case-1---migrate-existing-data) - * [MySQL Migration Case 1 - Listen For New Data](#mysql-migration-case-1---listen-for-new-data) - * [MySQL Migration Case 2 - without Tables Lock](#mysql-migration-case-2---without-tables-lock) - * [MySQL Migration Case 2 - Create ClickHouse Table](#mysql-migration-case-2---create-clickhouse-table) - * [MySQL Migration Case 2 - Listen For New Data](#mysql-migration-case-2---listen-for-new-data) - * [MySQL Migration Case 2 - Migrate Existing Data](#mysql-migration-case-2---migrate-existing-data) - * [airline.ontime Test Case](#airlineontime-test-case) - * [airline.ontime Data Set in CSV files](#airlineontime-data-set-in-csv-files) - * [airline.ontime MySQL Table](#airlineontime-mysql-table) - * [airline.ontime ClickHouse Table](#airlineontime-clickhouse-table) - * [airline.ontime Data Reader](#airlineontime-data-reader) - * [airline.ontime Data Importer](#airlineontime-data-importer) - * [Testing](#testing) - * [Testing General Schema](#testing-general-schema) - * [MySQL Data Types](#mysql-data-types) - * [ClickHouse Data Types](#clickhouse-data-types) - * [MySQL -> ClickHouse Data Types Mapping](#mysql---clickhouse-data-types-mapping) - * [MySQL Test Tables](#mysql-test-tables) - * [ClickHouse Test Tables](#clickhouse-test-tables) - ---- - -# Introduction - -Utility to import data into ClickHouse 
from MySQL (mainly) and/or CSV files - -# Requirements and Installation - -Datareader requires at least **Python 3.4** with additional modules to be installed. -In most distributions Python 3 have `pip` utility named as `pip3`, so we'll use this naming. -However, you may have it called differently. - -Datareader can be installed either from `github` repo or from `pypi` repo. - -## Dev Installation -```bash -sudo yum install -y rpm-build -sudo yum install -y epel-release -sudo yum install -y https://dev.mysql.com/get/mysql57-community-release-el7-11.noarch.rpm -curl -s https://packagecloud.io/install/repositories/altinity/clickhouse/script.rpm.sh | sudo bash - -sudo yum install -y python34-pip python34-devel python34-setuptools - -./package_rpm_distr.sh -./pack/build.sh -ls -l ./build/bdist.linux-x86_64/rpm/RPMS/noarch/ -sudo yum install ./build/bdist.linux-x86_64/rpm/RPMS/noarch/clickhouse-mysql-* -``` - -## RPM Installation -**Tested on CentOS 7** - -Packagecloud repo from [packagecloud.io](https://packagecloud.io/Altinity/clickhouse) -More details on installation are available on [https://github.com/Altinity/clickhouse-rpm-install](https://github.com/Altinity/clickhouse-rpm-install) -```bash -curl -s https://packagecloud.io/install/repositories/altinity/clickhouse/script.rpm.sh | sudo bash -``` -Install EPEL (for `python3`) and MySQL (for `libmysqlclient`) repos -```bash -sudo yum install -y epel-release -sudo yum install -y https://dev.mysql.com/get/mysql57-community-release-el7-11.noarch.rpm -``` - -If you do not have EPEL available in your repos, install it directly from EPEL site -```bash -sudo yum install -y https://download.fedoraproject.org/pub/epel/7/x86_64/Packages/e/epel-release-7-11.noarch.rpm -``` - -Install data reader from [packagecloud.io](https://packagecloud.io/Altinity/clickhouse) -```bash -sudo yum install -y clickhouse-mysql -``` -clickhouse packages would also be installed as dependencies. - -Prepare config file - copy **example** file into production and edit it. -```bash -sudo cp /etc/clickhouse-mysql/clickhouse-mysql-example.conf /etc/clickhouse-mysql/clickhouse-mysql.conf -sudo vim /etc/clickhouse-mysql/clickhouse-mysql.conf -``` - -Start service -```bash -sudo service clickhouse-mysql start -``` - -## PyPi Installation -In case you need just to use the app - this is the most convenient way to go. - -Install dependencies. 
-MySQL repo (for `mysql-community-devel`) -```bash -sudo yum install -y https://dev.mysql.com/get/mysql57-community-release-el7-11.noarch.rpm -``` -epel (for `python3`) -```bash -sudo yum install -y epel-release -``` - -clickhouse-client (for `clickhouse-client`) from Packagecloud repo from [packagecloud.io](https://packagecloud.io/Altinity/clickhouse) -More details on installation are available on [https://github.com/Altinity/clickhouse-rpm-install](https://github.com/Altinity/clickhouse-rpm-install) -```bash -curl -s https://packagecloud.io/install/repositories/altinity/clickhouse/script.rpm.sh | sudo bash -``` -```bash -sudo yum install -y clickhouse-client -``` - -and direct dependencies: -```bash -sudo yum install -y mysql-community-devel -sudo yum install -y mariadb-devel -sudo yum install -y gcc -sudo yum install -y python34-devel python34-pip -``` - -Install data reader -```bash -sudo pip3 install clickhouse-mysql -``` - -Now we are able to call datareader as an app - perform last installation steps - install service files, etc -```bash -[user@localhost ~]$ which clickhouse-mysql -/usr/bin/clickhouse-mysql -/usr/bin/clickhouse-mysql --install -``` - -## GitHub-based Installation - Clone Sources -In case you'd like to play around with the sources this is the way to go. - -Install dependencies: - -`MySQLdb` package is used for communication with MySQL: -```bash -pip3 install mysqlclient -``` - -`mysql-replication` package is used for communication with MySQL also: -[https://github.com/noplay/python-mysql-replication](https://github.com/noplay/python-mysql-replication) -```bash -pip3 install mysql-replication -``` - -`clickhouse-driver` package is used for communication with ClickHouse: -[https://github.com/mymarilyn/clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) -```bash -pip3 install clickhouse-driver -``` - -Clone sources from github -```bash -git clone https://github.com/Altinity/clickhouse-mysql-data-reader -``` - -## MySQL setup - -Also the following (at least one of) MySQL privileges are required for this operation: `SUPER`, `REPLICATION CLIENT` - -```mysql -CREATE USER 'reader'@'%' IDENTIFIED BY 'qwerty'; -CREATE USER 'reader'@'127.0.0.1' IDENTIFIED BY 'qwerty'; -CREATE USER 'reader'@'localhost' IDENTIFIED BY 'qwerty'; -GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'%'; -GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'127.0.0.1'; -GRANT SELECT, REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'localhost'; -FLUSH PRIVILEGES; -``` - -Also the following MySQL config options are required: -```ini -[mysqld] -# mandatory -server-id = 1 -log_bin = /var/lib/mysql/bin.log -binlog-format = row # very important if you want to receive write, update and delete row events -# optional -expire_logs_days = 30 -max_binlog_size = 768M -# setup listen address -bind-address = 0.0.0.0 -``` - -# Quick Start - -Suppose we have MySQL `airline.ontime` table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](clickhouse_mysql_examples/airline_ontime_schema_mysql.sql) and want to migrate it into ClickHouse. 
- -Steps to do: - - * Setup MySQL access as described in [MySQL setup](#mysql-setup) - * Run data reader as following: - -```bash -clickhouse-mysql \ - --src-server-id=1 \ - --src-wait \ - --nice-pause=1 \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=qwerty \ - --src-tables=airline.ontime \ - --dst-host=127.0.0.1 \ - --dst-create-table \ - --migrate-table \ - --pump-data \ - --csvpool -``` - -Expected results are: - * automatically create target table in ClickHouse (if possible) - * migrate existing data from MySQL to ClickHouse - * after migration completed, listen for new events to come and pump data from MySQL into ClickHouse - -Options description - * `--src-server-id` - Master's server id - * `--src-wait` - wait for new data to come - * `--nice-pause=1` - when no data available sleep for 1 second - * `--src-host=127.0.0.1` - MySQL source host - * `--src-user=reader` - MySQL source user (remember about PRIVILEGES for this user) - * `--src-password=qwerty` - MySQL source password (remember about PRIVILEGES for this user) - * `--src-tables=airline.ontime` - list of MySQL source tables to process - * `--dst-host=127.0.0.1` - ClickHouse host - * `--dst-create-table` - create target table automatically - * `--migrate-table` - migrate source tables - * `--pump-data` - pump data from MySQL into ClickHouse after data migrated - * `--csvpool` - make pool of csv files while pumping data (assumes `--mempool` also) - -Choose any combination of `--pump-data`, `--migrate-table`, `--create-table-sql`, `--dst-create-table` - -# Operation - -## Requirements and Limitations - -Data reader understands INSERT SQL statements only. In practice this means that: - * You need to create required table in ClickHouse before starting data read procedure. More on how to create target ClickHouse table: [MySQL -> ClickHouse Data Types Mapping](#mysql---clickhouse-data-types-mapping) - * From all DML statements INSERT-only are handled, which means: - * UPDATE statements are not handled - meaning UPDATEs within MySQL would not be relayed into ClickHouse - * DELETE statements are not handled - meaning DELETEs within MySQL would not be relayed into ClickHouse - * DDL statements are not handled, which means: - * source table structure change (ALTER TABLE) has to be handled externally and can lead to insertion errors - -## Operation General Schema - - * Step 1. Data Reader reads data from the source event-by-event (for MySQL binlog) or line-by-line (file). - * Step 2. **OPTIONAL** Caching in memory pool. Since ClickHouse prefers to get data in bundles (row-by-row insertion is extremely slow), we need to introduce some caching. - Cache can be flushed by either of: - * number of rows in cache - * number of events in cache - * time elapsed - * data source depleted - * Step 3. **OPTIONAL** Writing CSV file. Sometimes it is useful to have data also represented as a file - * Step 4. Writing data into ClickHouse. Depending on the configuration of the previous steps data are written into ClickHouse by either of: - * directly event-by-event or line-by-line - * from memory cache as a bulk insert operation - * from CSV file via `clickhouse-client` - -## Performance - -`pypy` significantly improves performance. You should try it. Really. Up to **10 times performance boost** can be achieved. 
-For example you can start with [Portable PyPy distribution for Linux](https://github.com/squeaky-pl/portable-pypy#portable-pypy-distribution-for-linux) - - use [Python 3.x release](https://github.com/squeaky-pl/portable-pypy#latest-python-35-release) -Unpack it into your place of choice. - -```bash -[user@localhost ~]$ ls -l pypy3.5-5.9-beta-linux_x86_64-portable -total 32 -drwxr-xr-x 2 user user 140 Oct 24 01:14 bin -drwxr-xr-x 5 user user 4096 Oct 3 11:57 include -drwxr-xr-x 4 user user 4096 Oct 3 11:57 lib -drwxr-xr-x 13 user user 4096 Oct 3 11:56 lib_pypy -drwxr-xr-x 3 user user 15 Oct 3 11:56 lib-python --rw-r--r-- 1 user user 11742 Oct 3 11:56 LICENSE --rw-r--r-- 1 user user 1296 Oct 3 11:56 README.rst -drwxr-xr-x 14 user user 4096 Oct 24 01:16 site-packages -drwxr-xr-x 2 user user 195 Oct 3 11:57 virtualenv_support -``` - -Install `pip` -```bash -pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy -m ensurepip -``` -Install required modules -```bash -pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install mysql-replication -pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install clickhouse-driver -``` -`mysqlclient` may require to install `libmysqlclient-dev` and `gcc` -```bash -pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install mysqlclient -``` -Install them if need be -```bash -sudo apt-get install libmysqlclient-dev -``` -```bash -sudo apt-get install gcc -``` - -Now you can run data reader via `pypy` -```bash -/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy clickhouse-mysql -``` - -# Examples - -## Base Example - -Let's walk over test example of tool launch command line options. -This code snippet is taken from shell script (see more details in [airline.ontime Test Case](#airlineontime-test-case)) - -```bash -$PYTHON clickhouse-mysql ${*:1} \ - --src-server-id=1 \ - --src-resume \ - --src-wait \ - --nice-pause=1 \ - --log-level=info \ - --log-file=ontime.log \ - --src-host=127.0.0.1 \ - --src-user=root \ - --dst-host=127.0.0.1 \ - --csvpool \ - --csvpool-file-path-prefix=qwe_ \ - --mempool-max-flush-interval=60 \ - --mempool-max-events-num=1000 \ - --pump-data -``` -Options description - * `--src-server-id` - Master's server id - * `--src-resume` - resume data loading from the previous point. 
When the tool starts - resume from the end of the log - * `--src-wait` - wait for new data to come - * `--nice-pause=1` - when no data available sleep for 1 second - * `--log-level=info` - log verbosity - * `--log-file=ontime.log` - log file name - * `--src-host=127.0.0.1` - MySQL source host - * `--src-user=root` - MySQL source user (remember about PRIVILEGES for this user) - * `--dst-host=127.0.0.1` - ClickHouse host - * `--csvpool` - make pool of csv files (assumes `--mempool` also) - * `--csvpool-file-path-prefix=qwe_` - put these CSV files having `qwe_` prefix in `CWD` - * `--mempool-max-flush-interval=60` - flush mempool at least every 60 seconds - * `--mempool-max-events-num=1000` - flush mempool at least each 1000 events (not rows, but events) - * `--pump-data` - pump data from MySQL into ClickHouse - -## MySQL Migration Case 1 - with Tables Lock - -Suppose we have MySQL `airline.ontime` table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](clickhouse_mysql_examples/airline_ontime_schema_mysql.sql) with multiple rows: - -```mysql -mysql> SELECT COUNT(*) FROM airline.ontime; -+----------+ -| count(*) | -+----------+ -| 7694964 | -+----------+ -``` - -MySQL is already configured as [described earlier](#mysql-setup). -Let's migrate existing data to ClickHouse and listen for newly coming data in order to migrate them to CLickHouse on-the-fly. - -### MySQL Migration Case 1 - Create ClickHouse Table - -Create ClickHouse table description -```bash -clickhouse-mysql \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=Qwerty1# \ - --create-table-sql-template \ - --with-create-database \ - --src-tables=airline.ontime > create_clickhouse_table_template.sql -``` -We have **CREATE TABLE** template stored in `create_clickhouse_table_template.sql` file. -```bash -vim create_clickhouse.sql -``` -Setup sharding field and primary key. These columns must not be `Nullable` -```bash mysql -...cut... - `Year` UInt16, -...cut... - `FlightDate` Date, -...cut... - `Month` UInt8, -...cut... -) ENGINE = MergeTree(FlightDate, (FlightDate, Year, Month), 8192) -``` - -Create table in ClickHouse -```bash -clickhouse-client -mn < create_clickhouse_table_template.sql -``` - -### MySQL Migration Case 1 - Migrate Existing Data - -Lock MySQL in order to avoid new data coming while data migration is running. Keep `mysql` client open during the whole process -```mysql -mysql> FLUSH TABLES WITH READ LOCK; -``` - -Migrate data -```bash -clickhouse-mysql \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=Qwerty1# \ - --migrate-table \ - --src-tables=airline.ontime \ - --dst-host=127.0.0.1 -``` -This may take some time. -Check all data is in ClickHouse -```mysql -:) select count(*) from airline.ontime; - -SELECT count(*) -FROM airline.ontime - -┌─count()─┐ -│ 7694964 │ -└─────────┘ -``` - -### MySQL Migration Case 1 - Listen For New Data - -Start `clickhouse-mysql` as a replication slave, so it will listen for new data coming: -```bash -clickhouse-mysql \ - --src-server-id=1 \ - --src-resume \ - --src-wait \ - --nice-pause=1 \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=Qwerty1# \ - --src-tables=airline.ontime \ - --dst-host=127.0.0.1 \ - --csvpool \ - --csvpool-file-path-prefix=qwe_ \ - --mempool-max-flush-interval=60 \ - --mempool-max-events-num=10000 \ - --pump-data -``` - -Allow new data to be inserted into MySQL - i.e. unlock tables. - -```mysql -mysql> UNLOCK TABLES; -``` - -Insert some data into MySQL. 
For example, via [clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh](clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh) script - -```mysql -mysql> SELECT COUNT(*) FROM airline.ontime; -+----------+ -| count(*) | -+----------+ -| 10259952 | -+----------+ -``` - -Replication will be pumping data from MySQL into ClickHouse in background and in some time we'll see the following picture in ClickHouse: -```mysql -:) select count(*) from airline.ontime; - -SELECT count(*) -FROM airline.ontime - -┌──count()─┐ -│ 10259952 │ -└──────────┘ -``` - -## MySQL Migration Case 2 - without Tables Lock -Suppose we'd like to migrate multiple log tables of the same structure named as `log_XXX` - i.e. all of them have `log_` name prefix -into one ClickHouse table named `logunified` of the following structure -```sql -DESCRIBE TABLE logunified - -┌─name─┬─type───┬─default_type─┬─default_expression─┐ -│ id │ UInt64 │ │ │ -│ day │ Date │ │ │ -│ str │ String │ │ │ -└──────┴────────┴──────────────┴────────────────────┘ -``` -Log tables by nature are `INSERT`-only tables. Let's migrate these tables. - -### MySQL Migration Case 2 - Create ClickHouse Table -Prepare tables templates in `create_clickhouse.sql` file -```bash -clickhouse-mysql \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=qwerty \ - --create-table-sql-template \ - --with-create-database \ - --src-tables-prefixes=db.log_ > create_clickhouse_table_template.sql -``` -Edit templates -```bash -vim create_clickhouse_table_template.sql -``` -And create tables in ClickHouse -```bash - -clickhouse-client -mn < create_clickhouse_table_template.sql -``` - -### MySQL Migration Case 2 - Listen For New Data -```bash -clickhouse-mysql \ - --src-server-id=1 \ - --src-resume \ - --src-wait \ - --nice-pause=1 \ - --log-level=info \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=qwerty \ - --src-tables-prefixes=log_ \ - --dst-host=127.0.0.1 \ - --dst-table=logunified \ - --csvpool \ - --pump-data -``` -Pay attention to -```bash - --src-tables-prefixes=log_ \ - --dst-table=logunified \ -``` -Replication data from multiple tables into one destination table `--dst-table=logunified`. - -Monitor logs for `first row in replication` notification of the following structure: -```bash -INFO:first row in replication db.log_201801_2 -column: id=1727834 -column: day=2018-01-20 -column: str=data event 3 -``` -These records help us to create SQL statement for Data Migration process. -Sure, we can peek into MySQL database manually in order to understand what records would be the last to be copied by migration process. 
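-
-For illustration, here is a minimal sketch of turning such a log record into a where-clause file for the migration step below. It assumes the log format shown above; the helper itself is hypothetical and is not shipped with the tool:
-```bash
-# Hypothetical helper, not part of clickhouse-mysql:
-# take the id reported for db.log_201801_2 ("column: id=1727834")
-# and write the boundary condition consumed later via --src-tables-where-clauses.
-FIRST_REPLICATED_ID=1727834
-echo "id < ${FIRST_REPLICATED_ID}" > db.log_201801_2.sql
-```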
- -### MySQL Migration Case 2 - Migrate Existing Data - -```bash -clickhouse-mysql \ - --src-host=127.0.0.1 \ - --src-user=reader \ - --src-password=qwerty \ - --migrate-table \ - --src-tables-prefixes=db.log_ \ - --src-tables-where-clauses=db.log_201801_1=db.log_201801_1.sql,db.log_201801_2=db.log_201801_2.sql,db.log_201801_3=db.log_201801_3.sql \ - --dst-host=127.0.0.1 \ - --dst-table=logunified \ - --csvpool -``` - -Pay attention to -```bash - --src-tables-prefixes=db.log_ \ - --src-tables-where-clauses=db.log_201801_1=db.log_201801_1.sql,db.log_201801_2=db.log_201801_2.sql,db.log_201801_3=db.log_201801_3.sql \ - --dst-table=logunified \ -``` -Migration subset of data described in `--src-tables-where-clauses` files from multiple tables into one destination table `--dst-table=logunified` - -Values for where clause in `db.log_201801_1.sql` are fetched from `first row in replication` log: `INFO:first row in replication db.log_201801_1` -```bash -cat db.log_201801_1.sql -id < 1727831 -``` - -Result: -```sql -:) select count(*) from logunified; - -SELECT count(*) -FROM logunified - -┌──count()─┐ -│ 12915568 │ -└──────────┘ - -``` - -## airline.ontime Test Case - -Main Steps - * Download airline.ontime dataset - * Create airline.ontime MySQL table - * Create airline.ontime ClickHouse table - * Start data reader (utility to migrate data MySQL -> ClickHouse) - * Start data importer (utility to import data into MySQL) - * Check how data are loaded into ClickHouse - -### airline.ontime Data Set in CSV files -Run [download script](clickhouse_mysql_examples/airline_ontime_data_download.sh) - -You may want to adjust dirs where to keep `ZIP` and `CSV` file - -In `airline_ontime_data_download.sh` edit these lines: -```bash -... -ZIP_FILES_DIR="zip" -CSV_FILES_DIR="csv" -... -``` -You may want to adjust number of files to download (In case downloading all it may take some time). - -Specify year and months range as you wish: -```bash -... -echo "Download files into $ZIP_FILES_DIR" -for year in `seq 1987 2017`; do - for month in `seq 1 12`; do -... -``` - -```bash -./airline_ontime_data_download.sh -``` -Downloading can take some time. - -### airline.ontime MySQL Table -Create MySQL table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](clickhouse_mysql_examples/airline_ontime_schema_mysql.sql): -```bash -mysql -uroot -p < clickhouse_mysql_examples/airline_ontime_schema_mysql.sql -``` - -### airline.ontime ClickHouse Table -Create ClickHouse table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_ch.sql](clickhouse_mysql_examples/airline_ontime_schema_ch.sql): -```bash -clickhouse-client -mn < clickhouse_mysql_examples/airline_ontime_schema_ch.sql -``` - -### airline.ontime Data Reader -Run [datareader script](clickhouse_mysql_examples/airline_ontime_data_mysql_to_ch_reader.sh) - -You may want to adjust `PYTHON` path and source and target hosts and usernames -```bash -... -PYTHON=python3.6 -PYTHON=/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy -... -``` -```bash -... - --src-host=127.0.0.1 \ - --src-user=root \ - --dst-host=127.0.0.1 \ -... -``` -```bash -./airline_ontime_data_mysql_to_ch_reader.sh -``` - -### airline.ontime Data Importer -Run [data importer script](clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh) - -You may want to adjust `CSV` files location, number of imported files and MySQL user/password used for import -```bash -... 
-# looking for csv files in this dir -FILES_TO_IMPORT_DIR="/mnt/nas/work/ontime" - -# limit import to this number of files -FILES_TO_IMPORT_NUM=3 -... -``` -```bash -... - -u root \ -... -``` - -```bash -./airline_ontime_mysql_data_import.sh -``` - -# Testing - -## Testing General Schema - -### MySQL Data Types - -#### Numeric Types - - * `BIT` the number of bits per value, from 1 to 64 - * `TINYINT` -128 to 127. The unsigned range is 0 to 255 - * `BOOL`, `BOOLEAN` synonyms for `TINYINT(1)` - * `SMALLINT` -32768 to 32767. The unsigned range is 0 to 65535 - * `MEDIUMINT` -8388608 to 8388607. The unsigned range is 0 to 16777215. - * `INT`, `INTEGER` -2147483648 to 2147483647. The unsigned range is 0 to 4294967295 - * `BIGINT` -9223372036854775808 to 9223372036854775807. The unsigned range is 0 to 18446744073709551615 - - * `SERIAL` is an alias for `BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE`. - * `DEC`, `DECIMAL`, `FIXED`, `NUMERIC` A packed ?exact? fixed-point number - * `FLOAT` Permissible values are -3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38 - * `DOUBLE`, `REAL` Permissible values are -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308 - - -#### Date and Time Types - - * `DATE` The supported range is '1000-01-01' to '9999-12-31' - * `DATETIME` The supported range is '1000-01-01 00:00:00.000000' to '9999-12-31 23:59:59.999999' - * `TIMESTAMP` The range is '1970-01-01 00:00:01.000000' UTC to '2038-01-19 03:14:07.999999' - * `TIME` The range is '-838:59:59.000000' to '838:59:59.000000' - * `YEAR` Values display as 1901 to 2155, and 0000 - -#### String Types - * `CHAR` The range of M is 0 to 255. If M is omitted, the length is 1. - * `VARCHAR` The range of M is 0 to 65,535 - * `BINARY` similar to CHAR - * `VARBINARY` similar to VARCHAR - * `TINYBLOB` maximum length of 255 - * `TINYTEXT` maximum length of 255 - * `BLOB` maximum length of 65,535 - * `TEXT` maximum length of 65,535 - * `MEDIUMBLOB` maximum length of 16,777,215 - * `MEDIUMTEXT` maximum length of 16,777,215 - * `LONGBLOB` maximum length of 4,294,967,295 or 4GB - * `LONGTEXT` maximum length of 4,294,967,295 or 4GB - * `ENUM` can have a maximum of 65,535 distinct elements - * `SET` can have a maximum of 64 distinct members - - * `JSON` native JSON data type defined by RFC 7159 - ---- - -### ClickHouse Data Types - - * `Date` number of days since 1970-01-01 - * `DateTime` Unix timestamp - * `Enum8` or `Enum16`. A set of enumerated string values that are stored as `Int8` or `Int16`. The numeric values must be within -128..127 for Enum8 and -32768..32767 for Enum16 - * `Float32`, `Float64` - - * `Int8` -128 127 - * `UInt8` 0 255 - - * `Int16` -32768 32767 - * `UInt16` 0 65535 - - * `Int32` -2147483648 2147483647 - * `UInt32` 0 4294967295 - - * `Int64` -9223372036854775808 9223372036854775807 - * `UInt64` 0 18446744073709551615 - - * `FixedString(N)` string of `N` bytes (not characters or code points) - * `String` The length is not limited. The value can contain an arbitrary set of bytes, including null bytes - ---- - -### MySQL -> ClickHouse Data Types Mapping - -#### Numeric Types - - * `BIT` -> ??? (possibly `String`?) - * `TINYINT` -> `Int8`, `UInt8` - * `BOOL`, `BOOLEAN` -> `UInt8` - * `SMALLINT` -> `Int16`, `UInt16` - * `MEDIUMINT` -> `Int32`, `UInt32` - * `INT`, `INTEGER` -> `Int32`, `UInt32` - * `BIGINT` -> `Int64`, `UInt64` - - * `SERIAL` -> `UInt64` - * `DEC`, `DECIMAL`, `FIXED`, `NUMERIC` -> ???? (possibly `String`?) 
- * `FLOAT` -> `Float32` - * `DOUBLE`, `REAL` -> `Float64` - - -#### Date and Time Types - - * `DATE` -> `Date` (for valid values) or `String` - `Date` Allows storing values from just after the beginning of the Unix Epoch - to the upper threshold defined by a constant at the compilation stage - (currently, this is until the year 2038, but it may be expanded to 2106) - * `DATETIME` -> `DateTime` (for valid values) or `String` - * `TIMESTAMP` -> `DateTime` - * `TIME` -> ????? (possibly `String`?) - * `YEAR` -> `UInt16` - - -#### String Types - - * `CHAR` -> `FixedString` - * `VARCHAR` -> `String` - * `BINARY` -> `String` - * `VARBINARY` -> `String` - * `TINYBLOB` -> `String` - * `TINYTEXT` -> `String` - * `BLOB` -> `String` - * `TEXT` -> `String` - * `MEDIUMBLOB` -> `String` - * `MEDIUMTEXT` -> `String` - * `LONGBLOB` -> `String` - * `LONGTEXT` -> `String` - -#### Set Types - * `ENUM` -> `Enum8`, `Enum16` - * `SET` -> `Array(Int8)` - -#### Custom Types - * `JSON` -> ?????? (possibly `String`?) - - -### MySQL Test Tables - -We have to separate test table into several ones because of this error, produced by MySQL: -```text -ERROR 1118 (42000): Row size too large. The maximum row size for the used table type, not counting BLOBs, is 65535. This includes storage overhead, check the manual. You have to change some columns to TEXT or BLOBs -``` - -```mysql -CREATE TABLE datatypes( - - bit_1 BIT(1), - bit_2 BIT(64), - - tinyint_1 TINYINT COMMENT '-128 to 127', - u_tinyint_1 TINYINT UNSIGNED COMMENT '0 to 255', - - bool_1 BOOL, - bool_2 BOOLEAN, - - smallint_1 SMALLINT COMMENT '-32768 to 32767', - u_smallint_1 SMALLINT UNSIGNED COMMENT '0 to 65535', - - mediumint_1 MEDIUMINT COMMENT '-8388608 to 8388607', - u_mediumint_1 MEDIUMINT UNSIGNED COMMENT '0 to 16777215', - - int_1 INT COMMENT '-2147483648 to 2147483647', - u_int_1 INT UNSIGNED COMMENT '0 to 4294967295', - - integer_1 INTEGER COMMENT '-2147483648 to 2147483647', - u_integer_1 INTEGER UNSIGNED COMMENT '0 to 4294967295', - - bigint_1 BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', - u_bigint_1 BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', - - serial_1 SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 0 to 18446744073709551615', - - decimal_1 DECIMAL(3,2) COMMENT 'exact fixed-point number', - dec_1 DEC(3,2) COMMENT 'alias for DECIMAL', - fixed_1 FIXED(3,2) COMMENT 'alias for DECIMAL', - numeric_1 NUMERIC(3,2) COMMENT 'alias for DECIMAL', - - float_1 FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', - u_float_1 FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', - - double_1 DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_double_1 DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - real_1 REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_real_1 REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 
0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - date_1 DATE COMMENT '1000-01-01 to 9999-12-31', - datetime_1 DATETIME COMMENT '1000-01-01 00:00:00 to 9999-12-31 23:59:59', - timestamp_1 TIMESTAMP COMMENT '1970-01-01 00:00:01 UTC to 2038-01-19 03:14:07 UTC', - time_1 TIME COMMENT '-838:59:59 to 838:59:59', - year_1 YEAR COMMENT '1901 to 2155, and 0000', - - char_0 CHAR(0), - char_1 CHAR(1), - char_2 CHAR(255), - - varchar_0 VARCHAR(0), - varchar_1 VARCHAR(1), - - binary_0 BINARY(0) COMMENT 'similar to CHAR', - binary_1 BINARY(1) COMMENT 'similar to CHAR', - binary_2 BINARY(255) COMMENT 'similar to CHAR', - - varbinary_0 VARBINARY(0) COMMENT 'similar to VARCHAR', - varbinary_1 VARBINARY(1) COMMENT 'similar to VARCHAR', - - tinyblob_1 TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', - tinytext_1 TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', - - blob_1 BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', - text_1 TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', - - mediumblob_1 MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', - mediumtext_1 MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', - - longblob_1 LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', - longtext_1 LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) characters' -) -; - -CREATE TABLE enum_datatypes( - enum_1 ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements' -) -; - -CREATE TABLE set_datatypes( - set_1 SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT ' can have a maximum of 64 distinct members' -) -; - -CREATE TABLE json_datatypes( - json_1 JSON -) -; - -CREATE TABLE long_varchar_datatypes( - varchar_2 VARCHAR(65532) -) -; - -CREATE TABLE long_varbinary_datatypes( - varbinary_2 VARBINARY(65532) COMMENT 'similar to VARCHAR' -) -; -``` - - -```mysql --- in order to be able to set timestamp = '1970-01-01 00:00:01' -set time_zone='+00:00'; -``` - -Insert minimal acceptable values into the test table: - -```mysql --- MIN values -INSERT INTO datatypes SET - - bit_1 = 0b0, -- BIT(1), - bit_2 = 0b0, -- BIT(64), - - tinyint_1 = -128, -- TINYINT COMMENT '-128 to 127', - u_tinyint_1 = 0, -- TINYINT UNSIGNED COMMENT '0 to 255', - - bool_1 = FALSE, -- BOOL, - bool_2 = FALSE, -- BOOLEAN, - - smallint_1 = -32768, -- SMALLINT COMMENT '-32768 to 32767', - u_smallint_1 = 0, -- SMALLINT UNSIGNED COMMENT '0 to 65535', - - mediumint_1 = -8388608, -- MEDIUMINT COMMENT '-8388608 to 8388607', - u_mediumint_1 = 0, -- MEDIUMINT UNSIGNED COMMENT '0 to 16777215', - - int_1 = -2147483648, -- INT COMMENT '-2147483648 to 2147483647', - u_int_1 = 0, -- INT UNSIGNED COMMENT '0 to 4294967295', - - integer_1 = -2147483648, -- INTEGER COMMENT '-2147483648 to 2147483647', - u_integer_1 = 0, -- INTEGER UNSIGNED COMMENT '0 to 4294967295', - - bigint_1 = -9223372036854775808, -- BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', - u_bigint_1 = 0, -- BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', - - serial_1 = 0, -- SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 
0 to 18446744073709551615', - - decimal_1 = -9.99, -- DECIMAL(3,2) COMMENT 'exact fixed-point number', - dec_1 = -9.99, -- DEC(3,2) COMMENT 'alias for DECIMAL', - fixed_1 = -9.99, -- FIXED(3,2) COMMENT 'alias for DECIMAL', - numeric_1 = -9.99, -- NUMERIC(3,2) COMMENT 'alias for DECIMAL', - - float_1 = -3.402823466E+38, -- FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', - u_float_1 = 0, -- FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', - - double_1 = -1.7976931348623157E+308, -- DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_double_1 = 0, -- DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - real_1 = -1.7976931348623157E+308, -- REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_real_1 = 0, -- REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - date_1 = '1970-01-01', -- DATE COMMENT '1000-01-01 to 9999-12-31', - datetime_1 = '1970-01-01 00:00:00', -- DATETIME COMMENT '1000-01-01 00:00:00 to 9999-12-31 23:59:59', - timestamp_1 = '1970-01-01 00:00:01', -- TIMESTAMP COMMENT '1970-01-01 00:00:01 UTC to 2038-01-19 03:14:07 UTC', - time_1 = '-838:59:59', -- TIME COMMENT '-838:59:59 to 838:59:59', - year_1 = 1901, -- YEAR COMMENT '1901 to 2155, and 0000', - - char_0 = '', -- CHAR(0), - char_1 = '', -- CHAR(1), - char_2 = '', -- CHAR(255), - - varchar_0 = '', -- VARCHAR(0), - varchar_1 = '', -- VARCHAR(1), - - binary_0 = '', -- BINARY(0) COMMENT 'similar to CHAR', - binary_1 = '', -- BINARY(1) COMMENT 'similar to CHAR', - binary_2 = '', -- BINARY(255) COMMENT 'similar to CHAR', - - varbinary_0 = '', -- VARBINARY(0) COMMENT 'similar to VARCHAR', - varbinary_1 = '', -- VARBINARY(1) COMMENT 'similar to VARCHAR', - - tinyblob_1 = '', -- TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', - tinytext_1 = '', -- TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', - - blob_1 = '', -- BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', - text_1 = '', -- TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', - - mediumblob_1 = '', -- MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', - mediumtext_1 = '', -- MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', - - longblob_1 = '', -- LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', - longtext_1 = '' -- LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 
1) characters' -; - -INSERT INTO enum_datatypes SET - enum_1 = NULL -- ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements' -; - -INSERT INTO set_datatypes SET - set_1 = '' -- SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 64 distinct members' -; - -INSERT INTO json_datatypes SET - json_1 = '{}' -- JSON -; - -INSERT INTO long_varchar_datatypes SET - varchar_2 = "" -; - -INSERT INTO long_varbinary_datatypes SET - varbinary_2 = "" -; -``` - -Insert maximum acceptable values into the test table: - -```mysql --- MAX values -INSERT INTO datatypes SET - - bit_1 = 0b1, -- BIT(1), - bit_2 = 0b1111111111111111111111111111111111111111111111111111111111111111, -- BIT(64), - - tinyint_1 = 127, -- TINYINT COMMENT '-128 to 127', - u_tinyint_1 = 255, -- TINYINT UNSIGNED COMMENT '0 to 255', - - bool_1 = TRUE, -- BOOL, - bool_2 = TRUE, -- BOOLEAN, - - smallint_1 = 32767, -- SMALLINT COMMENT '-32768 to 32767', - u_smallint_1 = 65535, -- SMALLINT UNSIGNED COMMENT '0 to 65535', - - mediumint_1 = 8388607, -- MEDIUMINT COMMENT '-8388608 to 8388607', - u_mediumint_1 = 16777215, -- MEDIUMINT UNSIGNED COMMENT '0 to 16777215', - - int_1 = 2147483647, -- INT COMMENT '-2147483648 to 2147483647', - u_int_1 = 4294967295, -- INT UNSIGNED COMMENT '0 to 4294967295', - - integer_1 = 2147483647, -- INTEGER COMMENT '-2147483648 to 2147483647', - u_integer_1 = 4294967295, -- INTEGER UNSIGNED COMMENT '0 to 4294967295', - - bigint_1 = 9223372036854775807, -- BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', - u_bigint_1 = 18446744073709551615, -- BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', - - serial_1 = 18446744073709551615, -- SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 0 to 18446744073709551615', - - decimal_1 = 9.99, -- DECIMAL(3,2) COMMENT 'exact fixed-point number', - dec_1 = 9.99, -- DEC(3,2) COMMENT 'alias for DECIMAL', - fixed_1 = 9.99, -- FIXED(3,2) COMMENT 'alias for DECIMAL', - numeric_1 = 9.99, -- NUMERIC(3,2) COMMENT 'alias for DECIMAL', - - float_1 = 3.402823466E+38, -- FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', - u_float_1 = 3.402823466E+38, -- FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', - - double_1 = 1.7976931348623157E+308, -- DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_double_1 = 1.7976931348623157E+308, -- DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - real_1 = 1.7976931348623157E+308, -- REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_real_1 = 1.7976931348623157E+308, -- REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 
0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - date_1 = '2149-06-01', -- DATE COMMENT '1000-01-01 to 9999-12-31', - datetime_1 = '2106-02-01 23:59:59', -- DATETIME COMMENT '1000-01-01 00:00:00 to 9999-12-31 23:59:59', - timestamp_1 = '2038-01-19 03:14:07', -- TIMESTAMP COMMENT '1970-01-01 00:00:01 UTC to 2038-01-19 03:14:07 UTC', - time_1 = '838:59:59', -- TIME COMMENT '-838:59:59 to 838:59:59', - year_1 = 2155, -- YEAR COMMENT '1901 to 2155, and 0000', - - char_0 = '', -- CHAR(0), - char_1 = 'a', -- CHAR(1), - char_2 = 'abcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcde', -- CHAR(255), - - varchar_0 = '', -- VARCHAR(0), - varchar_1 = 'a', -- VARCHAR(1), - - binary_0 = '', -- BINARY(0) COMMENT 'similar to CHAR', - binary_1 = 'a', -- BINARY(1) COMMENT 'similar to CHAR', - binary_2 = 'abcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcde', -- BINARY(255) COMMENT 'similar to CHAR', - - varbinary_0 = '', -- VARBINARY(0) COMMENT 'similar to VARCHAR', - varbinary_1 = 'a', -- VARBINARY(1) COMMENT 'similar to VARCHAR', - - tinyblob_1 = 'a', -- TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', - tinytext_1 = 'a', -- TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', - - blob_1 = 'a', -- BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', - text_1 = 'a', -- TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', - - mediumblob_1 = 'a', -- MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', - mediumtext_1 = 'a', -- MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', - - longblob_1 = 'a', -- LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', - longtext_1 = 'a' -- LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 
1) characters' -; - -INSERT INTO enum_datatypes SET - enum_1 = 'a' -- ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements' -; - -INSERT INTO set_datatypes SET - set_1 = 'a,b,c' -- SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 64 distinct members', -; - -INSERT INTO json_datatypes SET - json_1 = '{"a":1, "b":2, "c":3}' -- JSON -; - -INSERT INTO long_varchar_datatypes SET - varchar_2 = "abc" -; - -INSERT INTO long_varbinary_datatypes SET - varbinary_2 = "abc" -; -``` - -### ClickHouse Test Tables - -```sql -CREATE TABLE datatypes( - bit_1 Nullable(String), -- bit_1 BIT(1), - bit_2 Nullable(String), -- bit_2 BIT(64), - - tinyint_1 Nullable(Int8), -- tinyint_1 TINYINT COMMENT '-128 to 127', - u_tinyint_1 Nullable(UInt8), -- u_tinyint_1 TINYINT UNSIGNED COMMENT '0 to 255', - - bool_1 Nullable(UInt8), -- bool_1 BOOL, - bool_2 Nullable(UInt8), -- bool_2 BOOLEAN, - - smallint_1 Nullable(Int16), -- smallint_1 SMALLINT COMMENT '-32768 to 32767', - u_smallint_1 Nullable(UInt16), -- u_smallint_1 SMALLINT UNSIGNED COMMENT '0 to 65535', - - mediumint_1 Nullable(Int32), -- mediumint_1 MEDIUMINT COMMENT '-8388608 to 8388607', - u_mediumint_1 Nullable(UInt32), -- u_mediumint_1 MEDIUMINT UNSIGNED COMMENT '0 to 16777215', - - int_1 Nullable(Int32), -- int_1 INT COMMENT '-2147483648 to 2147483647', - u_int_1 Nullable(UInt32), -- u_int_1 INT UNSIGNED COMMENT '0 to 4294967295', - - integer_1 Nullable(Int32), -- integer_1 INTEGER COMMENT '-2147483648 to 2147483647', - u_integer_1 Nullable(UInt32), -- u_integer_1 INTEGER UNSIGNED COMMENT '0 to 4294967295', - - bigint_1 Nullable(Int64), -- bigint_1 BIGINT COMMENT '-9223372036854775808 to 9223372036854775807', - u_bigint_1 Nullable(UInt64), -- u_bigint_1 BIGINT UNSIGNED COMMENT '0 to 18446744073709551615', - - serial_1 Nullable(UInt64), -- serial_1 SERIAL COMMENT 'i.e. BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 0 to 18446744073709551615', - - decimal_1 Nullable(String), -- decimal_1 DECIMAL(3,2) COMMENT 'exact fixed-point number', - dec_1 Nullable(String), -- dec_1 DEC(3,2) COMMENT 'alias for DECIMAL', - fixed_1 Nullable(String), -- fixed_1 FIXED(3,2) COMMENT 'alias for DECIMAL', - numeric_1 Nullable(String), -- numeric_1 NUMERIC(3,2) COMMENT 'alias for DECIMAL', - - float_1 Nullable(Float32), -- float_1 FLOAT COMMENT '-3.402823466E+38 to -1.175494351E-38, 0, and 1.175494351E-38 to 3.402823466E+38', - u_float_1 Nullable(Float32), -- u_float_1 FLOAT UNSIGNED COMMENT ' 0, and 1.175494351E-38 to 3.402823466E+38', - - double_1 Nullable(Float64), -- double_1 DOUBLE COMMENT '-1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_double_1 Nullable(Float64), -- u_double_1 DOUBLE UNSIGNED COMMENT ' 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - real_1 Nullable(Float64), -- real_1 REAL COMMENT 'alias for DOUBLE i.e. -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - u_real_1 Nullable(Float64), -- u_real_1 REAL UNSIGNED COMMENT 'alias for UNSIGNED DOUBLE i.e. 
0, and 2.2250738585072014E-308 to 1.7976931348623157E+308', - - date_1 Nullable(Date), -- date_1 DATE COMMENT '1000-01-01 to 9999-12-31', - datetime_1 Nullable(DateTime), -- datetime_1 DATETIME COMMENT '1000-01-01 00:00:00.000000 to 9999-12-31 23:59:59.999999', - timestamp_1 Nullable(DateTime), -- timestamp_1 TIMESTAMP COMMENT '1970-01-01 00:00:01.000000 UTC to 2038-01-19 03:14:07.999999 UTC', - time_1 Nullable(String), -- time_1 TIME COMMENT '-838:59:59.000000 to 838:59:59.000000', - year_1 Nullable(UInt16), -- year_1 YEAR COMMENT '1901 to 2155, and 0000', - - char_0 Nullable(FixedString(1)), -- char_0 CHAR(0), - char_1 Nullable(FixedString(1)), -- char_1 CHAR(1), - char_2 Nullable(FixedString(255)), -- char_2 CHAR(255), - - varchar_0 Nullable(String), -- varchar_0 VARCHAR(0), - varchar_1 Nullable(String), -- varchar_1 VARCHAR(1), - - binary_0 Nullable(String), -- binary_0 BINARY(0) COMMENT 'similar to CHAR', - binary_1 Nullable(String), -- binary_1 BINARY(1) COMMENT 'similar to CHAR', - binary_2 Nullable(String), -- binary_2 BINARY(255) COMMENT 'similar to CHAR', - - varbinary_0 Nullable(String), -- varbinary_0 VARBINARY(0) COMMENT 'similar to VARCHAR', - varbinary_1 Nullable(String), -- varbinary_1 VARBINARY(1) COMMENT 'similar to VARCHAR', - - tinyblob_1 Nullable(String), -- tinyblob_1 TINYBLOB COMMENT 'maximum length of 255 (2^8 ? 1) bytes', - tinytext_1 Nullable(String), -- tinytext_1 TINYTEXT COMMENT 'maximum length of 255 (2^8 ? 1) characters', - - blob_1 Nullable(String), -- blob_1 BLOB COMMENT 'maximum length of 65,535 (2^16 ? 1) bytes', - text_1 Nullable(String), -- text_1 TEXT COMMENT 'maximum length of 65,535 (2^16 ? 1) characters', - - mediumblob_1 Nullable(String), -- mediumblob_1 MEDIUMBLOB COMMENT 'maximum length of 16,777,215 (2^24 ? 1) bytes', - mediumtext_1 Nullable(String), -- mediumtext_1 MEDIUMTEXT COMMENT 'maximum length of 16,777,215 (2^24 ? 1) characters', - - longblob_1 Nullable(String), -- longblob_1 LONGBLOB COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 1) bytes', - longtext_1 Nullable(String) -- longtext_1 LONGTEXT COMMENT 'maximum length of 4,294,967,295 or 4GB (2^32 ? 
1) characters', - -) ENGINE = Log -; - -CREATE TABLE enum_datatypes( - enum_1 Enum16('a'=1, 'b'=2, 'c'=3, 'd'=4, 'e'=5, 'f'=6) -- enum_1 ENUM('a', 'b', 'c', 'd', 'e', 'f') COMMENT 'can have a maximum of 65,535 distinct elements', -) ENGINE = Memory -; - -CREATE TABLE set_datatypes( - set_1 Array(Enum16('a'=1, 'b'=2, 'c'=3, 'd'=4, 'e'=5, 'f'=6)) -- set_1 SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT ' can have a maximum of 64 distinct members', -) ENGINE = Memory -; - -CREATE TABLE set_datatypes( - set_1 String -- set_1 SET('a', 'b', 'c', 'd', 'e', 'f') COMMENT ' can have a maximum of 64 distinct members', -) ENGINE = Memory -; - - -CREATE TABLE json_datatypes( - json_1 String -- json_1 JSON -) ENGINE = Memory -; - -CREATE TABLE long_varchar_datatypes( - varchar_2 String -) ENGINE = Memory -; - -CREATE TABLE long_varbinary_datatypes( - varbinary_2 String -) ENGINE = Memory -; -``` +- [Manual](docs/manual.md) +- [Usage examples](docs/usage-references.md) From e05ce7e08eaed1e75967953e3b89963b3204feff Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Mon, 24 Feb 2020 11:51:47 +0300 Subject: [PATCH 33/67] docs: fix links --- docs/manual.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/manual.md b/docs/manual.md index 1451bcf..6fa8724 100644 --- a/docs/manual.md +++ b/docs/manual.md @@ -200,7 +200,7 @@ bind-address = 0.0.0.0 # Quick Start -Suppose we have MySQL `airline.ontime` table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](clickhouse_mysql_examples/airline_ontime_schema_mysql.sql) and want to migrate it into ClickHouse. +Suppose we have MySQL `airline.ontime` table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](../clickhouse_mysql_examples/airline_ontime_schema_mysql.sql) and want to migrate it into ClickHouse. Steps to do: @@ -360,7 +360,7 @@ Options description ## MySQL Migration Case 1 - with Tables Lock -Suppose we have MySQL `airline.ontime` table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](clickhouse_mysql_examples/airline_ontime_schema_mysql.sql) with multiple rows: +Suppose we have MySQL `airline.ontime` table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](../clickhouse_mysql_examples/airline_ontime_schema_mysql.sql) with multiple rows: ```mysql mysql> SELECT COUNT(*) FROM airline.ontime; @@ -464,7 +464,7 @@ Allow new data to be inserted into MySQL - i.e. unlock tables. mysql> UNLOCK TABLES; ``` -Insert some data into MySQL. For example, via [clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh](clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh) script +Insert some data into MySQL. For example, via [clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh](../clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh) script ```mysql mysql> SELECT COUNT(*) FROM airline.ontime; @@ -609,7 +609,7 @@ Main Steps * Check how data are loaded into ClickHouse ### airline.ontime Data Set in CSV files -Run [download script](clickhouse_mysql_examples/airline_ontime_data_download.sh) +Run [download script](../clickhouse_mysql_examples/airline_ontime_data_download.sh) You may want to adjust dirs where to keep `ZIP` and `CSV` file @@ -637,19 +637,19 @@ for year in `seq 1987 2017`; do Downloading can take some time. 
### airline.ontime MySQL Table -Create MySQL table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](clickhouse_mysql_examples/airline_ontime_schema_mysql.sql): +Create MySQL table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_mysql.sql](../clickhouse_mysql_examples/airline_ontime_schema_mysql.sql): ```bash mysql -uroot -p < clickhouse_mysql_examples/airline_ontime_schema_mysql.sql ``` ### airline.ontime ClickHouse Table -Create ClickHouse table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_ch.sql](clickhouse_mysql_examples/airline_ontime_schema_ch.sql): +Create ClickHouse table of the [following structure - clickhouse_mysql_examples/airline_ontime_schema_ch.sql](../clickhouse_mysql_examples/airline_ontime_schema_ch.sql): ```bash clickhouse-client -mn < clickhouse_mysql_examples/airline_ontime_schema_ch.sql ``` ### airline.ontime Data Reader -Run [datareader script](clickhouse_mysql_examples/airline_ontime_data_mysql_to_ch_reader.sh) +Run [datareader script](../clickhouse_mysql_examples/airline_ontime_data_mysql_to_ch_reader.sh) You may want to adjust `PYTHON` path and source and target hosts and usernames ```bash @@ -670,7 +670,7 @@ PYTHON=/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy ``` ### airline.ontime Data Importer -Run [data importer script](clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh) +Run [data importer script](../clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh) You may want to adjust `CSV` files location, number of imported files and MySQL user/password used for import ```bash From 26a60e68cb1fa92dcedab7ed78802bd689590542 Mon Sep 17 00:00:00 2001 From: Vladislav Klimenko Date: Thu, 16 Apr 2020 16:17:41 +0300 Subject: [PATCH 34/67] docs: remove RPM sections --- docs/manual.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/manual.md b/docs/manual.md index 6fa8724..a064842 100644 --- a/docs/manual.md +++ b/docs/manual.md @@ -2,8 +2,10 @@ * [Introduction](#introduction) * [Requirements and Installation](#requirements-and-installation) + * [PyPi Installation](#pypi-installation) * [GitHub-based Installation - Clone Sources](#github-based-installation---clone-sources) * [MySQL setup](#mysql-setup) @@ -50,6 +52,8 @@ However, you may have it called differently. Datareader can be installed either from `github` repo or from `pypi` repo. + ## PyPi Installation In case you need just to use the app - this is the most convenient way to go. 
From 020ffbc9ac949a0bfd7ffb3677f0364363f496fb Mon Sep 17 00:00:00 2001 From: ygnuss Date: Tue, 9 Mar 2021 13:58:48 +0100 Subject: [PATCH 35/67] Initial support for updates --- clickhouse_mysql/event/event.py | 3 + clickhouse_mysql/pool/bbpool.py | 14 +- clickhouse_mysql/pumper.py | 18 ++- clickhouse_mysql/reader/mysqlreader.py | 120 ++++++++++++++--- clickhouse_mysql/reader/reader.py | 7 + clickhouse_mysql/writer/chcsvwriter.py | 124 ++++++++++++++++- clickhouse_mysql/writer/chwriter.py | 164 +++++++++++++++++++++++ clickhouse_mysql/writer/csvwriter.py | 12 ++ clickhouse_mysql/writer/poolwriter.py | 12 ++ clickhouse_mysql/writer/processwriter.py | 66 +++++++++ clickhouse_mysql/writer/writer.py | 22 +++ 11 files changed, 539 insertions(+), 23 deletions(-) diff --git a/clickhouse_mysql/event/event.py b/clickhouse_mysql/event/event.py index 836f3d2..e018e57 100644 --- a/clickhouse_mysql/event/event.py +++ b/clickhouse_mysql/event/event.py @@ -28,6 +28,9 @@ class Event(object): # table name table = None + # primary key + primary_key = None + # /path/to/csv/file.csv filename = None diff --git a/clickhouse_mysql/pool/bbpool.py b/clickhouse_mysql/pool/bbpool.py index f15c268..c36265b 100644 --- a/clickhouse_mysql/pool/bbpool.py +++ b/clickhouse_mysql/pool/bbpool.py @@ -6,6 +6,7 @@ from clickhouse_mysql.pool.pool import Pool from clickhouse_mysql.objectbuilder import ObjectBuilder +from pymysqlreplication.row_event import WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent # Buckets Belts' Index Generator @@ -149,7 +150,18 @@ def rotate_belt(self, belt_index, flush=False): # time to flush data for specified key #self.writer_builder.param('csv_file_path_suffix_parts', [str(int(now)), str(self.buckets_num_total)]) writer = self.writer_builder.new() - writer.insert(self.belts[belt_index].pop()) + item = self.belts[belt_index].pop() + # process event based on its type + if isinstance(item[0].pymysqlreplication_event, WriteRowsEvent): + writer.insert(item) + elif isinstance(item[0].pymysqlreplication_event, DeleteRowsEvent): + writer.delete(item) + elif isinstance(item[0].pymysqlreplication_event, UpdateRowsEvent): + writer.update(item) + else: + # skip other unhandled events + pass + # writer.insert(self.belts[belt_index].pop()) writer.close() writer.push() writer.destroy() diff --git a/clickhouse_mysql/pumper.py b/clickhouse_mysql/pumper.py index e75bc34..a868938 100644 --- a/clickhouse_mysql/pumper.py +++ b/clickhouse_mysql/pumper.py @@ -11,7 +11,6 @@ class Pumper(object): writer = None def __init__(self, reader=None, writer=None): - self.reader = reader self.writer = writer @@ -19,6 +18,8 @@ def __init__(self, reader=None, writer=None): # subscribe on reader's event notifications self.reader.subscribe({ 'WriteRowsEvent': self.write_rows_event, + 'UpdateRowsEvent': self.update_rows_event, + 'DeleteRowsEvent': self.delete_rows_event, # 'WriteRowsEvent.EachRow': self.write_rows_event_each_row, 'ReaderIdleEvent': self.reader_idle_event, }) @@ -46,5 +47,20 @@ def reader_idle_event(self): """ self.writer.flush() + def delete_rows_event(self, event=None): + """ + DeleteRowsEvent handler + :param event: + """ + self.writer.delete(event) + + def update_rows_event(self, event=None): + """ + UpdateRowsEvent handler + :param event: + """ + self.writer.update(event) + + if __name__ == '__main__': print("pumper") diff --git a/clickhouse_mysql/reader/mysqlreader.py b/clickhouse_mysql/reader/mysqlreader.py index f21e8b1..659ab77 100644 --- a/clickhouse_mysql/reader/mysqlreader.py +++ 
b/clickhouse_mysql/reader/mysqlreader.py @@ -12,7 +12,7 @@ from clickhouse_mysql.event.event import Event from clickhouse_mysql.tableprocessor import TableProcessor from clickhouse_mysql.util import Util -#from pymysqlreplication.event import QueryEvent, RotateEvent, FormatDescriptionEvent +from pymysqlreplication.event import QueryEvent, RotateEvent, FormatDescriptionEvent class MySQLReader(Reader): @@ -56,13 +56,15 @@ def __init__( self.server_id = server_id self.log_file = log_file self.log_pos = log_pos - self.schemas = None if not TableProcessor.extract_dbs(schemas, Util.join_lists(tables, tables_prefixes)) else TableProcessor.extract_dbs(schemas, Util.join_lists(tables, tables_prefixes)) + self.schemas = None if not TableProcessor.extract_dbs(schemas, Util.join_lists(tables, + tables_prefixes)) else TableProcessor.extract_dbs( + schemas, Util.join_lists(tables, tables_prefixes)) self.tables = None if tables is None else TableProcessor.extract_tables(tables) self.tables_prefixes = None if tables_prefixes is None else TableProcessor.extract_tables(tables_prefixes) self.blocking = blocking self.resume_stream = resume_stream self.nice_pause = nice_pause - self.binlog_position_file=binlog_position_file + self.binlog_position_file = binlog_position_file logging.info("raw dbs list. len()=%d", 0 if schemas is None else len(schemas)) if schemas is not None: @@ -86,7 +88,8 @@ def __init__( if tables_prefixes is not None: for table in tables_prefixes: logging.info(table) - logging.info("normalised tables-prefixes list. len()=%d", 0 if self.tables_prefixes is None else len(self.tables_prefixes)) + logging.info("normalised tables-prefixes list. len()=%d", + 0 if self.tables_prefixes is None else len(self.tables_prefixes)) if self.tables_prefixes is not None: for table in self.tables_prefixes: logging.info(table) @@ -101,21 +104,21 @@ def __init__( # we are interested in reading CH-repeatable events only only_events=[ # Possible events - #BeginLoadQueryEvent, + # BeginLoadQueryEvent, DeleteRowsEvent, - #ExecuteLoadQueryEvent, - #FormatDescriptionEvent, - #GtidEvent, - #HeartbeatLogEvent, - #IntvarEvent - #NotImplementedEvent, - #QueryEvent, - #RotateEvent, - #StopEvent, - #TableMapEvent, + # ExecuteLoadQueryEvent, + # FormatDescriptionEvent, + # GtidEvent, + # HeartbeatLogEvent, + # IntvarEvent + # NotImplementedEvent, + # QueryEvent, + # RotateEvent, + # StopEvent, + # TableMapEvent, UpdateRowsEvent, WriteRowsEvent, - #XidEvent, + # XidEvent, ], only_schemas=self.schemas, # in case we have any prefixes - this means we need to listen to all tables within specified schemas @@ -245,6 +248,9 @@ def process_write_rows_event(self, mysql_event): :param mysql_event: WriteRowsEvent instance :return: """ + + logging.debug("Received insert event for table: " + mysql_event.table) + if self.tables_prefixes: # we have prefixes specified # need to find whether current event is produced by table in 'looking-into-tables' list @@ -294,10 +300,81 @@ def process_write_rows_event(self, mysql_event): self.stat_write_rows_event_finalyse() def process_update_rows_event(self, mysql_event): - logging.info("Skip update rows") + + logging.debug("Received update event for table: " + mysql_event.table + " Schema: " + mysql_event.schema) + + # for row in mysql_event.rows: + # for key in row['before_values']: + # logging.debug("\t *%s:%s=>%s" % (key, row["before_values"][key], row["after_values"][key])) + + if self.tables_prefixes: + # we have prefixes specified + # need to find whether current event is produced by table in 
'looking-into-tables' list + if not self.is_table_listened(mysql_event.table): + # this table is not listened + # processing is over - just skip event + return + + # statistics + #self.stat_write_rows_event_calc_rows_num_min_max(rows_num_per_event=len(mysql_event.rows)) + + if self.subscribers('UpdateRowsEvent'): + # dispatch event to subscribers + + # statistics + #self.stat_write_rows_event_all_rows(mysql_event=mysql_event) + + # dispatch Event + event = Event() + event.schema = mysql_event.schema + event.table = mysql_event.table + event.pymysqlreplication_event = mysql_event + + #self.process_first_event(event=event) + self.notify('UpdateRowsEvent', event=event) + + # self.stat_write_rows_event_finalyse() + + # logging.info("Skip update rows") def process_delete_rows_event(self, mysql_event): - logging.info("Skip delete rows") + logging.debug("Received delete event for table: " + mysql_event.table) + + """ + for row in mysql_event.rows: + for key in row['values']: + logging.debug("\t *", key, ":", row["values"][key]) + """ + + if self.tables_prefixes: + # we have prefixes specified + # need to find whether current event is produced by table in 'looking-into-tables' list + if not self.is_table_listened(mysql_event.table): + # this table is not listened + # processing is over - just skip event + return + + # statistics + #self.stat_write_rows_event_calc_rows_num_min_max(rows_num_per_event=len(mysql_event.rows)) + + if self.subscribers('DeleteRowsEvent'): + # dispatch event to subscribers + + # statistics + #self.stat_write_rows_event_all_rows(mysql_event=mysql_event) + + # dispatch Event + event = Event() + event.schema = mysql_event.schema + event.table = mysql_event.table + event.pymysqlreplication_event = mysql_event + + self.process_first_event(event=event) + self.notify('DeleteRowsEvent', event=event) + + # self.stat_write_rows_event_finalyse() + + # logging.info("Skip delete rows") def process_binlog_position(self, file, pos): if self.binlog_position_file: @@ -321,14 +398,16 @@ def read(self): self.stat_init_fetch_loop() try: - logging.debug('Pre-start binlog position: ' + self.binlog_stream.log_file + ":" + str(self.binlog_stream.log_pos) if self.binlog_stream.log_pos is not None else "undef") + logging.debug('Pre-start binlog position: ' + self.binlog_stream.log_file + ":" + str( + self.binlog_stream.log_pos) if self.binlog_stream.log_pos is not None else "undef") # fetch available events from MySQL for mysql_event in self.binlog_stream: # new event has come # check what to do with it - logging.debug('Got Event ' + self.binlog_stream.log_file + ":" + str(self.binlog_stream.log_pos)) + logging.debug( + 'Got Event ' + self.binlog_stream.log_file + ":" + str(self.binlog_stream.log_pos)) # process event based on its type if isinstance(mysql_event, WriteRowsEvent): @@ -393,6 +472,7 @@ def read(self): logging.info('end %d', end_timestamp) logging.info('len %d', end_timestamp - self.start_timestamp) + if __name__ == '__main__': connection_settings = { 'host': '127.0.0.1', diff --git a/clickhouse_mysql/reader/reader.py b/clickhouse_mysql/reader/reader.py index 379cf5f..c4f5246 100644 --- a/clickhouse_mysql/reader/reader.py +++ b/clickhouse_mysql/reader/reader.py @@ -18,6 +18,13 @@ class Reader(Observable): # called when Reader has no data to read 'ReaderIdleEvent': [], + + # called on each DeleteRowsEvent + 'DeleteRowsEvent': [], + + # called on each UpdateRowsEvent + 'UpdateRowsEvent': [], + } def __init__(self, converter=None, callbacks={}): diff --git 
a/clickhouse_mysql/writer/chcsvwriter.py b/clickhouse_mysql/writer/chcsvwriter.py index caea56e..88571c3 100644 --- a/clickhouse_mysql/writer/chcsvwriter.py +++ b/clickhouse_mysql/writer/chcsvwriter.py @@ -33,7 +33,9 @@ def __init__( dst_schema += "_all" if dst_distribute and dst_table is not None: dst_table += "_all" - logging.info("CHCSWriter() connection_settings={} dst_schema={} dst_table={}".format(connection_settings, dst_schema, dst_table)) + logging.info( + "CHCSWriter() connection_settings={} dst_schema={} dst_table={}".format(connection_settings, dst_schema, + dst_table)) self.host = connection_settings['host'] self.port = connection_settings['port'] self.user = connection_settings['user'] @@ -98,3 +100,123 @@ def insert(self, event_or_events=None): os.system(bash) pass + + def deleteRow(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + events = self.listify(event_or_events) + if len(events) < 1: + logging.warning('No events to delete. class: %s', __class__) + return + + # assume we have at least one Event + + logging.debug('class:%s delete %d rows', __class__, len(events)) + + for event in events: + schema = self.dst_schema if self.dst_schema else event.schema + table = None + if self.dst_distribute: + table = TableProcessor.create_distributed_table_name(db=event.schema, table=event.table) + else: + table = self.dst_table if self.dst_table else event.table + if self.dst_schema: + table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) + + sql = 'ALTER TABLE `{0}`.`{1}` DELETE WHERE {2} = {3} '.format( + schema, + table, + ' AND '.join(map(lambda column: '`%s`' % column, event.fieldnames)), + ) + + choptions = "" + if self.host: + choptions += " --host=" + shlex.quote(self.host) + if self.port: + choptions += " --port=" + str(self.port) + if self.user: + choptions += " --user=" + shlex.quote(self.user) + if self.password: + choptions += " --password=" + shlex.quote(self.password) + bash = "tail -n +2 '{0}' | clickhouse-client {1} --query='{2}'".format( + event.filename, + choptions, + sql, + ) + + logging.info('starting clickhouse-client process for delete operation') + logging.debug('starting %s', bash) + os.system(bash) + + pass + + def update(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + logging.info('starting clickhouse-client process for update operation') + + events = self.listify(event_or_events) + if len(events) < 1: + logging.warning('No events to update. 
class: %s', __class__) + return + + # assume we have at least one Event + + logging.debug('class:%s update %d rows', __class__, len(events)) + + for event in events: + schema = self.dst_schema if self.dst_schema else event.schema + table = None + if self.dst_distribute: + table = TableProcessor.create_distributed_table_name(db=event.schema, table=event.table) + else: + table = self.dst_table if self.dst_table else event.table + if self.dst_schema: + table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) + + sql = 'INSERT INTO `{0}`.`{1}` ({2}) FORMAT CSV'.format( + schema, + table, + ', '.join(map(lambda column: '`%s`' % column, event.fieldnames)), + ) + + sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {3}'.format( + schema, + table, + ', '.join(map(lambda column, value: '`%s`=`%s' % column, event.fieldnames, event.fieldnames)) + ) + + choptions = "" + if self.host: + choptions += " --host=" + shlex.quote(self.host) + if self.port: + choptions += " --port=" + str(self.port) + if self.user: + choptions += " --user=" + shlex.quote(self.user) + if self.password: + choptions += " --password=" + shlex.quote(self.password) + bash = "tail -n +2 '{0}' | clickhouse-client {1} --query='{2}'".format( + event.filename, + choptions, + sql, + ) + + logging.info('starting clickhouse-client process') + logging.debug('starting %s', bash) + os.system(bash) + + pass diff --git a/clickhouse_mysql/writer/chwriter.py b/clickhouse_mysql/writer/chwriter.py index 587d48f..46ad105 100644 --- a/clickhouse_mysql/writer/chwriter.py +++ b/clickhouse_mysql/writer/chwriter.py @@ -112,6 +112,170 @@ def insert(self, event_or_events=None): # all DONE + def deleteRow(self, event_or_events): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + logging.debug("Delete CHWriter") + + events = self.listify(event_or_events) + if len(events) < 1: + logging.warning('No events to insert. class: %s', __class__) + return + + # assume we have at least one Event + + logging.debug('class:%s delete %d event(s)', __class__, len(events)) + + # verify and converts events and consolidate converted rows from all events into one batch + + rows = [] + event_converted = None + for event in events: + if not event.verify: + logging.warning('Event verification failed. Skip one event. 
Event: %s Class: %s', event.meta(), + __class__) + continue # for event + + event_converted = self.convert(event) + for row in event_converted: + for key in row.keys(): + # we need to convert Decimal value to str value for suitable for table structure + if type(row[key]) == Decimal: + row[key] = str(row[key]) + rows.append(row) + + logging.debug('class:%s delete %d row(s)', __class__, len(rows)) + + # determine target schema.table + + schema = self.dst_schema if self.dst_schema else event_converted.schema + table = None + if self.dst_distribute: + table = TableProcessor.create_distributed_table_name(db=event_converted.schema, table=event_converted.table) + else: + table = self.dst_table if self.dst_table else event_converted.table + if self.dst_schema: + table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) + + logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format(schema, table, self.dst_schema, + self.dst_table)) + + # and DELETE converted rows + + sql = '' + try: + sql = 'ALTER TABLE `{0}`.`{1}` DELETE WHERE {2} = {3} '.format( + schema, + table, + ' AND '.join(map(lambda column: '`%s`' % column, event.fieldnames)), + ) + self.client.execute(sql, rows) + except Exception as ex: + logging.critical('QUERY FAILED') + logging.critical('ex={}'.format(ex)) + logging.critical('sql={}'.format(sql)) + sys.exit(0) + + # all DONE + + """ + Get string format pattern for update and delete operations + """ + def get_data_format(self, column, value): + t = type(value) + if t == str: + return "`%s`='%s'" % (column, value) + else: + # int, float + return "`%s`=%s" % (column, value) + + def update(self, event_or_events): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + logging.debug("Update CHWriter") + + events = self.listify(event_or_events) + if len(events) < 1: + logging.warning('No events to update. class: %s', __class__) + return + + # assume we have at least one Event + + logging.debug('class:%s update %d event(s)', __class__, len(events)) + + # verify and converts events and consolidate converted rows from all events into one batch + + rows = [] + event_converted = None + pk = None + for event in events: + if not event.verify: + logging.warning('Event verification failed. Skip one event. 
Event: %s Class: %s', event.meta(), + __class__) + continue # for event + + event_converted = self.convert(event) + pk = event_converted.pymysqlreplication_event.primary_key + for row in event_converted.pymysqlreplication_event.rows: + for key in row['after_values'].keys(): + # we need to convert Decimal value to str value for suitable for table structure + if type(row['after_values'][key]) == Decimal: + row['after_values'][key] = str(row['after_values'][key]) + rows.append(row) + + logging.debug('class:%s update %d row(s)', __class__, len(rows)) + + # determine target schema.table + + schema = self.dst_schema if self.dst_schema else event_converted.schema + table = None + if self.dst_distribute: + table = TableProcessor.create_distributed_table_name(db=event_converted.schema, table=event_converted.table) + else: + table = self.dst_table if self.dst_table else event_converted.table + if self.dst_schema: + table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) + + logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format(schema, table, self.dst_schema, + self.dst_table)) + + # and UPDATE converted rows + + sql = '' + try: + sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {2} where {3}'.format( + schema, + table, + ', '.join(filter(None, map(lambda column, value: "" if column == pk else self.get_data_format(column, value), row['after_values'].keys(), row['after_values'].values()))), + ' and '.join(map(lambda column, value: self.get_data_format(column, value), row['before_values'].keys(), row['before_values'].values())) + ) + + # sql = "ALTER TABLE `test`.`animals` UPDATE `name`='pajaroTO', `position`=1 where `id`=1 and `name`='oso'" + self.client.execute(sql) + except Exception as ex: + logging.critical('QUERY FAILED') + logging.critical('ex={}'.format(ex)) + logging.critical('sql={}'.format(sql)) + # sys.exit(0) + + # all DONE + + + if __name__ == '__main__': connection_settings = { diff --git a/clickhouse_mysql/writer/csvwriter.py b/clickhouse_mysql/writer/csvwriter.py index 4ff9081..18cfda6 100644 --- a/clickhouse_mysql/writer/csvwriter.py +++ b/clickhouse_mysql/writer/csvwriter.py @@ -135,6 +135,18 @@ def insert(self, event_or_events): for row in event: self.writer.writerow(self.convert(row)) + def deleteRow(self, event_or_events): + """ + TODO + """ + logging.debug("Delete CSV Writer") + + def update(self, event_or_events): + """ + TODO + """ + logging.debug("Update CSV Writer") + def push(self): if not self.next_writer_builder or not self.fieldnames: return diff --git a/clickhouse_mysql/writer/poolwriter.py b/clickhouse_mysql/writer/poolwriter.py index b49e011..129f05a 100644 --- a/clickhouse_mysql/writer/poolwriter.py +++ b/clickhouse_mysql/writer/poolwriter.py @@ -37,6 +37,18 @@ def insert(self, event_or_events): logging.debug('class:%s insert', __class__) self.pool.insert(event_or_events) + + def delete(self, event_or_events): + """Insert delete data into Pool""" + logging.debug('class:%s delete', __class__) + self.pool.insert(event_or_events) + + def update(self, event_or_events): + """Insert update data into Pool""" + logging.debug('class:%s update', __class__) + self.pool.insert(event_or_events) + + def flush(self): self.pool.flush() diff --git a/clickhouse_mysql/writer/processwriter.py b/clickhouse_mysql/writer/processwriter.py index 226b72b..8177345 100644 --- a/clickhouse_mysql/writer/processwriter.py +++ b/clickhouse_mysql/writer/processwriter.py @@ -35,6 +35,28 @@ def process(self, event_or_events=None): writer.destroy() 
logging.debug('class:%s process() done', __class__) + def processDelete(self, event_or_events=None): + """Separate process body to be run""" + + logging.debug('class:%s process()', __class__) + writer = self.next_writer_builder.get() + writer.deleteRow(event_or_events) + writer.close() + writer.push() + writer.destroy() + logging.debug('class:%s process() done', __class__) + + def processUpdate(self, event_or_events=None): + """Separate process body to be run""" + + logging.debug('class:%s process()', __class__) + writer = self.next_writer_builder.get() + writer.delete(event_or_events) + writer.close() + writer.push() + writer.destroy() + logging.debug('class:%s process() done', __class__) + def insert(self, event_or_events=None): # event_or_events = [ # event: { @@ -57,6 +79,50 @@ def insert(self, event_or_events=None): logging.debug('class:%s insert done', __class__) pass + def delete(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + # start separated process with event_or_events to be inserted + + logging.debug('class:%s delete', __class__) + process = mp.Process(target=self.processDelete, args=(event_or_events,)) + + logging.debug('class:%s delete.process.start()', __class__) + process.start() + + #process.join() + logging.debug('class:%s delete done', __class__) + pass + + def update(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + # start separated process with event_or_events to be inserted + + logging.debug('class:%s update', __class__) + process = mp.Process(target=self.processUpdate, args=(event_or_events,)) + + logging.debug('class:%s update.process.start()', __class__) + process.start() + + #process.join() + logging.debug('class:%s update done', __class__) + pass + def flush(self): pass diff --git a/clickhouse_mysql/writer/writer.py b/clickhouse_mysql/writer/writer.py index 11f788c..3be276b 100644 --- a/clickhouse_mysql/writer/writer.py +++ b/clickhouse_mysql/writer/writer.py @@ -55,6 +55,28 @@ def insert(self, event_or_events=None): # ] pass + def update(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + pass + + def delete(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + pass + def flush(self): pass From 5212148bdf1512a7682532cd789f1fdda7e76a4f Mon Sep 17 00:00:00 2001 From: ygnuss Date: Wed, 10 Mar 2021 11:43:51 +0100 Subject: [PATCH 36/67] Fixed some errors with null values (None) for update events --- clickhouse_mysql/writer/chwriter.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/clickhouse_mysql/writer/chwriter.py b/clickhouse_mysql/writer/chwriter.py index 46ad105..5774091 100644 --- a/clickhouse_mysql/writer/chwriter.py +++ b/clickhouse_mysql/writer/chwriter.py @@ -10,7 +10,7 @@ from clickhouse_mysql.writer.writer import Writer from clickhouse_mysql.tableprocessor import TableProcessor -from clickhouse_mysql.event.event import Event +import datetime class CHWriter(Writer): @@ -108,7 +108,8 @@ def insert(self, event_or_events=None): logging.critical('QUERY FAILED') logging.critical('ex={}'.format(ex)) logging.critical('sql={}'.format(sql)) - sys.exit(0) + logging.critical('data={}'.format(rows)) + # sys.exit(0) # 
all DONE @@ -181,7 +182,7 @@ def deleteRow(self, event_or_events): logging.critical('QUERY FAILED') logging.critical('ex={}'.format(ex)) logging.critical('sql={}'.format(sql)) - sys.exit(0) + # sys.exit(0) # all DONE @@ -190,7 +191,7 @@ def deleteRow(self, event_or_events): """ def get_data_format(self, column, value): t = type(value) - if t == str: + if t == str or t is datetime.datetime: return "`%s`='%s'" % (column, value) else: # int, float @@ -260,11 +261,12 @@ def update(self, event_or_events): sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {2} where {3}'.format( schema, table, - ', '.join(filter(None, map(lambda column, value: "" if column == pk else self.get_data_format(column, value), row['after_values'].keys(), row['after_values'].values()))), - ' and '.join(map(lambda column, value: self.get_data_format(column, value), row['before_values'].keys(), row['before_values'].values())) + ', '.join(filter(None, map(lambda column, value: "" if column == pk or value is None else self.get_data_format(column, value), row['after_values'].keys(), row['after_values'].values()))), + ' and '.join(filter(None, map( + lambda column, value: "" if column != pk or value is None else self.get_data_format(column, value), + row['before_values'].keys(), row['before_values'].values()))) ) - # sql = "ALTER TABLE `test`.`animals` UPDATE `name`='pajaroTO', `position`=1 where `id`=1 and `name`='oso'" self.client.execute(sql) except Exception as ex: logging.critical('QUERY FAILED') From cc0a593a6ac2b4f4981a7c7a59e40eb582032d43 Mon Sep 17 00:00:00 2001 From: ygnuss Date: Wed, 10 Mar 2021 11:44:09 +0100 Subject: [PATCH 37/67] Added requirements file --- requirements.txt | 108 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..bce8d28 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,108 @@ +appdirs==1.4.4 +apturl==0.5.2 +astroid==2.4.2 +basictracer==3.1.0 +blinker==1.4 +Brlapi==0.7.0 +cachetools==4.0.0 +certifi==2020.12.5 +chardet==3.0.4 +chrome-gnome-shell==0.0.0 +Click==7.0 +clickhouse-driver==0.0.10 +clickhouse-toolset @ file:///tmp/clickhouse_toolset-0.9.dev0-cp38-cp38-linux_x86_64.whl +colorama==0.4.3 +command-not-found==0.3 +configobj==5.0.6 +crcmod==1.7 +cryptography==3.0 +cupshelpers==1.0 +datasketch==1.2.10 +dbus-python==1.2.16 +defer==1.0.6 +distlib==0.3.1 +distro==1.5.0 +distro-info===0.23ubuntu1 +dlib==19.16.0 +filelock==3.0.12 +httplib2==0.18.1 +humanfriendly==8.2 +idna==2.6 +importlib-metadata==1.6.0 +isort==5.7.0 +jeepney==0.4.3 +keyring==21.3.0 +language-selector==0.1 +launchpadlib==1.10.13 +lazr.restfulclient==0.14.2 +lazr.uri==1.0.5 +lazy-object-proxy==1.4.3 +louis==3.14.0 +macaroonbakery==1.3.1 +Markdown==3.2.1 +mccabe==0.6.1 +more-itertools==4.2.0 +msal==1.5.0 +netifaces==0.10.4 +numpy==1.15.0 +oauthlib==3.1.0 +olefile==0.46 +opencv-python==4.4.0.46 +opentracing==2.0.0 +passlib==1.7.1 +pexpect==4.6.0 +Pillow==7.2.0 +powerline-status==2.8.1 +protobuf==3.12.3 +psutil==5.6.3 +psycopg2-binary==2.8.5 +pycairo==1.16.2 +pycups==2.0.1 +pycurl==7.43.0.6 +Pygments==2.3.1 +PyGObject==3.38.0 +PyJWT==1.6.4 +pylint==2.6.0 +pymacaroons==0.13.0 +PyMySQL==1.0.2 +PyNaCl==1.4.0 +pyRFC3339==1.1 +python-apt==2.1.3+ubuntu1.3 +python-dateutil==2.8.1 +python-debian==0.1.37 +pytz==2020.1 +pyxdg==0.26 +PyYAML==5.3.1 +rangehttpserver==1.2.0 +redis==3.2.1 +reportlab==3.5.47 +requests==2.18.4 +requests-toolbelt==0.9.1 +requests-unixsocket==0.2.0 +screen-resolution-extra==0.0.0 
+SecretStorage==3.1.2 +simplejson==3.17.0 +six==1.15.0 +streaming-form-data==1.1.0 +systemd-python==234 +tabulate==0.8.3 +terminator==1.92 +-e git+git@gitlab.com:tinybird/analytics.git@0d13783b7e38c0decc97ac06901e8ce7b804221e#egg=tinybird +tinybird-cli==1.0.0b12 +TLPUI==1.3.1.post3 +toml==0.10.2 +toposort==1.5 +tornado==5.1.1 +tornado-opentracing==1.0.1 +torngithub==0.2.0 +ubuntu-advantage-tools==24.4 +ubuntu-drivers-common==0.0.0 +ufw==0.36 +unattended-upgrades==0.1 +urllib3==1.22 +vboxapi==1.0 +virtualenv==20.0.29+ds +wadllib==1.3.4 +wrapt==1.12.1 +xkit==0.0.0 +zipp==1.0.0 From e68ff22fee097af30de08169222fb3bc198547b8 Mon Sep 17 00:00:00 2001 From: ygnuss Date: Wed, 10 Mar 2021 12:02:05 +0100 Subject: [PATCH 38/67] Fixed handling delete events --- clickhouse_mysql/pumper.py | 2 +- clickhouse_mysql/writer/chwriter.py | 33 +++++++++++++++++++++++------ clickhouse_mysql/writer/writer.py | 12 ++++++++--- 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/clickhouse_mysql/pumper.py b/clickhouse_mysql/pumper.py index a868938..0b5d0c3 100644 --- a/clickhouse_mysql/pumper.py +++ b/clickhouse_mysql/pumper.py @@ -52,7 +52,7 @@ def delete_rows_event(self, event=None): DeleteRowsEvent handler :param event: """ - self.writer.delete(event) + self.writer.delete_row(event) def update_rows_event(self, event=None): """ diff --git a/clickhouse_mysql/writer/chwriter.py b/clickhouse_mysql/writer/chwriter.py index 5774091..5a243ae 100644 --- a/clickhouse_mysql/writer/chwriter.py +++ b/clickhouse_mysql/writer/chwriter.py @@ -113,7 +113,7 @@ def insert(self, event_or_events=None): # all DONE - def deleteRow(self, event_or_events): + def delete_row(self, event_or_events): # event_or_events = [ # event: { # row: {'id': 3, 'a': 3} @@ -138,6 +138,7 @@ def deleteRow(self, event_or_events): rows = [] event_converted = None + pk = None for event in events: if not event.verify: logging.warning('Event verification failed. Skip one event. 
Event: %s Class: %s', event.meta(), @@ -145,6 +146,7 @@ def deleteRow(self, event_or_events): continue # for event event_converted = self.convert(event) + pk = event_converted.pymysqlreplication_event.primary_key for row in event_converted: for key in row.keys(): # we need to convert Decimal value to str value for suitable for table structure @@ -170,14 +172,27 @@ def deleteRow(self, event_or_events): # and DELETE converted rows + sql = '' + # try: + # sql = 'ALTER TABLE `{0}`.`{1}` DELETE WHERE {2} = {3} '.format( + # schema, + # table, + # ' AND '.join(map(lambda column: '`%s`' % column, event.fieldnames)), + # ) + # self.client.execute(sql, rows) + sql = '' try: - sql = 'ALTER TABLE `{0}`.`{1}` DELETE WHERE {2} = {3} '.format( + sql = 'ALTER TABLE `{0}`.`{1}` DELETE WHERE {2}'.format( schema, table, - ' AND '.join(map(lambda column: '`%s`' % column, event.fieldnames)), + ' and '.join(filter(None, map( + lambda column, value: "" if column != pk else self.get_data_format(column, value), + row.keys(), row.values()))) ) - self.client.execute(sql, rows) + + self.client.execute(sql) + except Exception as ex: logging.critical('QUERY FAILED') logging.critical('ex={}'.format(ex)) @@ -200,10 +215,16 @@ def get_data_format(self, column, value): def update(self, event_or_events): # event_or_events = [ # event: { - # row: {'id': 3, 'a': 3} + # row: { + # 'before_values': {'id': 3, 'a': 3}, + # 'after_values': {'id': 3, 'a': 2} + # } # }, # event: { - # row: {'id': 3, 'a': 3} + # row: { + # 'before_values': {'id': 2, 'a': 3}, + # 'after_values': {'id': 2, 'a': 2} + # } # }, # ] diff --git a/clickhouse_mysql/writer/writer.py b/clickhouse_mysql/writer/writer.py index 3be276b..1bfaeb0 100644 --- a/clickhouse_mysql/writer/writer.py +++ b/clickhouse_mysql/writer/writer.py @@ -58,15 +58,21 @@ def insert(self, event_or_events=None): def update(self, event_or_events=None): # event_or_events = [ # event: { - # row: {'id': 3, 'a': 3} + # row: { + # 'before_values': {'id': 3, 'a': 3}, + # 'after_values': {'id': 3, 'a': 2} + # } # }, # event: { - # row: {'id': 3, 'a': 3} + # row: { + # 'before_values': {'id': 2, 'a': 3}, + # 'after_values': {'id': 2, 'a': 2} + # } # }, # ] pass - def delete(self, event_or_events=None): + def delete_row(self, event_or_events=None): # event_or_events = [ # event: { # row: {'id': 3, 'a': 3} From 09a2f9252f88d947f026687700dd9800c81f1d4b Mon Sep 17 00:00:00 2001 From: ygnuss Date: Wed, 10 Mar 2021 16:50:47 +0100 Subject: [PATCH 39/67] Added audit column for tracking changes in Clickhouse --- clickhouse_mysql/writer/chwriter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/clickhouse_mysql/writer/chwriter.py b/clickhouse_mysql/writer/chwriter.py index 5a243ae..fbe9613 100644 --- a/clickhouse_mysql/writer/chwriter.py +++ b/clickhouse_mysql/writer/chwriter.py @@ -206,7 +206,9 @@ def delete_row(self, event_or_events): """ def get_data_format(self, column, value): t = type(value) - if t == str or t is datetime.datetime: + if t == str: + return "`%s`='%s'" % (column, value.replace("'", "\\'")) + elif t is datetime.datetime: return "`%s`='%s'" % (column, value) else: # int, float @@ -279,10 +281,11 @@ def update(self, event_or_events): sql = '' try: - sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {2} where {3}'.format( + sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {2}, `tb_upd`={3} where {4}'.format( schema, table, ', '.join(filter(None, map(lambda column, value: "" if column == pk or value is None else self.get_data_format(column, value), row['after_values'].keys(), 
row['after_values'].values()))), + "'%s'" % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), ' and '.join(filter(None, map( lambda column, value: "" if column != pk or value is None else self.get_data_format(column, value), row['before_values'].keys(), row['before_values'].values()))) From 826b568b512c81e137c12e9058982d7dcb547fdf Mon Sep 17 00:00:00 2001 From: ygnuss Date: Tue, 16 Mar 2021 09:39:17 +0100 Subject: [PATCH 40/67] Added scripts for running listeners --- run-listeners.sh | 51 +++++++++++++++++++++++++++++++++++++++++++++++ stop-listeners.sh | 23 +++++++++++++++++++++ tb_tables.config | 9 +++++++++ 3 files changed, 83 insertions(+) create mode 100755 run-listeners.sh create mode 100755 stop-listeners.sh create mode 100644 tb_tables.config diff --git a/run-listeners.sh b/run-listeners.sh new file mode 100755 index 0000000..60a5996 --- /dev/null +++ b/run-listeners.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +LOG_LEVEL=debug + +SOURCE_HOST=127.0.0.1 +SOURCE_PORT=3307 +DESTINATION_HOST=127.0.0.1 +SOURCE_USER=tinybird +SOURCE_PASSWD=goo7eu9AeS3i + +PID_LOG_FILE=/tmp/listeners-pid.log + +source tb_tables.config + +############################################################ +# Run a process to synchronize MySQL table using binlog. +# +# $1 --> Source schema +# $2 --> Source table +# $3 --> Destination schema +# $4 --> Destination table +# $5 --> Server id +# $6 --> Log file +# $7 --> Binlog position file +# +############################################################# +run_listener() { + + (clickhouse-mysql --src-server-id=$5 --src-wait --src-resume --binlog-position-file $7 --nice-pause=1 --src-host=$SOURCE_HOST --src-port=$SOURCE_PORT --src-user=$SOURCE_USER --src-password=$SOURCE_PASSWD --src-schemas=$1 --src-tables=$2 --dst-host=$DESTINATION_HOST --dst-schema=$3 --dst-table=$4 --log-level=$LOG_LEVEL --pump-data 2>> $6)& + +} + +run_listener "movida_preproduction" "schedulings" "$TB_DATABASE" "$SCHEDULINGS_TABLE" "91" "out-schedulings.log" "bl-pos-schedulings" +echo $! > $PID_LOG_FILE + +run_listener "movida_preproduction" "platforms" "$TB_DATABASE" "$PLATFORMS_TABLE" "92" "out-platforms.log" "bl-pos-platforms" +echo $! >> $PID_LOG_FILE + +run_listener "movida_preproduction" "titles" "$TB_DATABASE" "$TITLES_TABLE" "93" "out-titles.log" "bl-pos-titles" +echo $! >> $PID_LOG_FILE + +run_listener "movida_preproduction" "assets" "$TB_DATABASE" "$ASSETS_TABLE" "94" "out-assets.log" "bl-pos-assets" +echo $! >> $PID_LOG_FILE + +run_listener "movida_preproduction" "features" "$TB_DATABASE" "$FEATURES_TABLE" "95" "out-features.log" "bl-pos-features" +echo $! >> $PID_LOG_FILE + +run_listener "movida_preproduction" "collection_entries" "$TB_DATABASE" "$COLLECTIONS_TABLE" "96" "out-collections.log" "bl-pos-collections" +echo $! 
>> $PID_LOG_FILE + +echo "PID processes in $PID_LOG_FILE" \ No newline at end of file diff --git a/stop-listeners.sh b/stop-listeners.sh new file mode 100755 index 0000000..582e97c --- /dev/null +++ b/stop-listeners.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +PID_LOG_FILE=/tmp/listeners-pid.log + +count_processes() { + echo `ps aux | grep clickhouse-mysql-data-reader | wc -l` +} + +total_before=$(count_processes) + +while IFS= read -r line +do + echo "$line" + kill $line +done < "$PID_LOG_FILE" + +total_after=$(count_processes) + +procs=`echo "$total_after - 1" | bc` + +if [ $total_after -gt 1 ]; then + echo "You still have $procs processes running" +fi \ No newline at end of file diff --git a/tb_tables.config b/tb_tables.config new file mode 100644 index 0000000..be59079 --- /dev/null +++ b/tb_tables.config @@ -0,0 +1,9 @@ +#!/bin/bash + +TB_DATABASE='d_073c5e' +TITLES_TABLE='t_8a192b9c7ece4572a5a2fc9858e26d5c' +ASSETS_TABLE='t_4c03fdeb4e3e4db784ead40b06ec8617' +COLLECTIONS_TABLE='t_3dd7b323438943c687bd4e13a0e181a1' +FEATURES_TABLE='t_23f41723e0eb480088cbb1c8f890a38c' +PLATFORMS_TABLE='t_83f598dc74254de68216a7c7735caffb' +SCHEDULINGS_TABLE='t_b5e541d4e73d4301ba736c427bd667c5' \ No newline at end of file From 6683cd9daf3bde034c3878708cd84bdc5228bad5 Mon Sep 17 00:00:00 2001 From: ygnuss Date: Tue, 16 Mar 2021 09:47:22 +0100 Subject: [PATCH 41/67] Added dumper script and moved all init scripts to init folder --- init/dump-tables.sh | 132 ++++++++++++++++++++ run-listeners.sh => init/run-listeners.sh | 0 stop-listeners.sh => init/stop-listeners.sh | 0 tb_tables.config => init/tb_tables.config | 0 4 files changed, 132 insertions(+) create mode 100755 init/dump-tables.sh rename run-listeners.sh => init/run-listeners.sh (100%) rename stop-listeners.sh => init/stop-listeners.sh (100%) rename tb_tables.config => init/tb_tables.config (100%) diff --git a/init/dump-tables.sh b/init/dump-tables.sh new file mode 100755 index 0000000..25eb02b --- /dev/null +++ b/init/dump-tables.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit -1 +fi + +DUMP_PATH=$1 + +source tb_tables.config + +########### +### titles +########### + +echo "Dumping titles" +mysqldump --host=127.0.0.1 --port=3307 --user=tinybird --password=goo7eu9AeS3i --single-transaction --quick movida_preproduction titles > $DUMP_PATH/titles.sql + +echo "use $TB_DATABASE;" > $DUMP_PATH/titles-insert-tb.sql +cat $DUMP_PATH/titles.sql | grep "INSERT INTO" >> $DUMP_PATH/titles-insert-tb.sql +sed -i 's/INSERT INTO `titles` VALUES/INSERT INTO `t_8a192b9c7ece4572a5a2fc9858e26d5c` (`id`, `name`, `licensor_id`, `created_at`, `updated_at`, `company_id`, `series_id`, `external_id`, `poster_file_name`, `poster_content_type`, `poster_file_size`, `poster_updated_at`, `episode_number`, `dirty_episode_number`, `rights_count`, `blackouts_count`, `denied_rights_count`, `images_count`, `cover_image_id`, `title_type`, `metadata_updated_at`, `promoted_content_id`, `promoted_content_type`, `soft_destroyed`, `credits_count`, `translated_attributes`, `rules_count`, `discarded`, `episode_reference_id`, `brand_id`) VALUES/g' $DUMP_PATH/titles-insert-tb.sql + +echo "Truncate titles table" +echo "truncate $TB_DATABASE.$TITLES_TABLE" | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn + +echo "Loading titles into CH" +cat $DUMP_PATH/titles-insert-tb.sql | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn +echo "Titles loaded" + +read -p "Press enter to continue" + +########### +### assets 
+########### + +echo "Dumping assets" +mysqldump --host=127.0.0.1 --port=3307 --user=tinybird --password=goo7eu9AeS3i --single-transaction --quick movida_preproduction assets > $DUMP_PATH/assets.sql + +echo "use $TB_DATABASE;" > $DUMP_PATH/assets-insert-tb.sql +cat $DUMP_PATH/assets.sql | grep "INSERT INTO" >> $DUMP_PATH/assets-insert-tb.sql +sed -i 's/INSERT INTO `assets` VALUES/INSERT INTO `t_4c03fdeb4e3e4db784ead40b06ec8617` (`id`, `name`, `title_id`, `created_at`, `updated_at`, `description`, `runtime_in_milliseconds`, `metadata_updated_at`, `company_id`, `asset_type_enumeration_entry_id`, `external_id`) VALUES/g' $DUMP_PATH/assets-insert-tb.sql + +echo "Truncate assets table" +echo "truncate $TB_DATABASE.$ASSETS_TABLE" | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn + +echo "Loading assets into CH" +cat $DUMP_PATH/assets-insert-tb.sql | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn +echo "Assets loaded" + +read -p "Press enter to continue" + +####################### +### Collection-entries +####################### + +echo "Dumping collection-entries" +mysqldump --host=127.0.0.1 --port=3307 --user=tinybird --password=goo7eu9AeS3i --single-transaction --quick movida_preproduction collection_entries > $DUMP_PATH/collections.sql + +echo "use $TB_DATABASE;" > $DUMP_PATH/collections-insert-tb.sql +cat $DUMP_PATH/collections.sql | grep "INSERT INTO" >> $DUMP_PATH/collections-insert-tb.sql +sed -i 's/INSERT INTO `collection_entries` VALUES/INSERT INTO `t_3dd7b323438943c687bd4e13a0e181a1` (`collection_id`, `title_id`, `id`, `position`) VALUES/g' $DUMP_PATH/collections-insert-tb.sql + +echo "Truncate collections table" +echo "truncate $TB_DATABASE.$COLLECTIONS_TABLE" | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn + +echo "Loading collection-entries into CH" +cat $DUMP_PATH/collections-insert-tb.sql | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn +echo "Collection-entries loaded" + +read -p "Press enter to continue" + +############## +### Features +############## + +echo "Dumping features" +mysqldump --host=127.0.0.1 --port=3307 --user=tinybird --password=goo7eu9AeS3i --single-transaction --quick movida_preproduction features > $DUMP_PATH/features.sql + +echo "use $TB_DATABASE;" > $DUMP_PATH/features-insert-tb.sql +read -p "Press enter to continue use" +cat $DUMP_PATH/features.sql | grep "INSERT INTO" >> $DUMP_PATH/features-insert-tb.sql +read -p "Press enter to continue insert" +sed -i 's/INSERT INTO `features` VALUES/INSERT INTO `t_23f41723e0eb480088cbb1c8f890a38c` (`id`, `name`, `enabled`, `company_id`, `created_at`, `updated_at`) VALUES/g' $DUMP_PATH/features-insert-tb.sql +read -p "Press enter to continue sed" +echo "Truncate features table" +echo "truncate $TB_DATABASE.$FEATURES_TABLE" | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn + +echo "Loading features into CH" +cat $DUMP_PATH/features-insert-tb.sql | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn +echo "Features loaded" + +read -p "Press enter to continue" + +############## +### Platforms +############## + +echo "Dumping platforms" +mysqldump --host=127.0.0.1 --port=3307 --user=tinybird --password=goo7eu9AeS3i --single-transaction --quick movida_preproduction platforms > $DUMP_PATH/platforms.sql + +echo "use $TB_DATABASE;" > $DUMP_PATH/platforms-insert-tb.sql +cat $DUMP_PATH/platforms.sql | grep "INSERT INTO" >> 
$DUMP_PATH/platforms-insert-tb.sql +sed -i 's/INSERT INTO `platforms` VALUES/INSERT INTO `t_83f598dc74254de68216a7c7735caffb` (`id`, `company_id`, `name`, `created_at`, `updated_at`, `sequence_service_titles_url`, `_deprecated_sequence_template_name`, `_deprecated_owned`, `sequence_template_url`, `metadata_constant_name`, `outlet_id`, `automatic_publication_enabled`, `metadata_updated_at`, `granted_categories`, `external_id`, `timezone`) VALUES/g' $DUMP_PATH/platforms-insert-tb.sql + +echo "Truncate platforms table" +echo "truncate $TB_DATABASE.$PLATFORMS_TABLE" | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn + +echo "Loading platforms into CH" +cat $DUMP_PATH/platforms-insert-tb.sql | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn +echo "Platforms loaded" + +read -p "Press enter to continue" + +################# +### Schedulings +################# + +echo "Dumping schedulings" +mysqldump --host=127.0.0.1 --port=3307 --user=tinybird --password=goo7eu9AeS3i --single-transaction --quick movida_preproduction schedulings > $DUMP_PATH/schedulings.sql + +echo "use $TB_DATABASE;" > $DUMP_PATH/schedulings-insert-tb.sql +cat $DUMP_PATH/schedulings.sql | grep "INSERT INTO" >> $DUMP_PATH/schedulings-insert-tb.sql +sed -i 's/INSERT INTO `schedulings` VALUES/INSERT INTO `t_b5e541d4e73d4301ba736c427bd667c5` (`id`, `title_id`, `put_up`, `take_down`, `created_at`, `updated_at`, `cleared`, `platform_id`, `rule_id`, `workflow_offset`, `sequence_asset_url`, `sequence_asset_name`, `workflow_sent`, `status`, `asset_id`, `rule_asset_id`, `title_group_id`, `workflow_web_url`, `_deprecated_publication_status`, `published_at`, `_prev_put_up`, `_prev_take_down`, `_pending_simulation`, `workflow_template_url`, `original_draft_scheduling_id`, `playlist_id`, `updating_playlist`, `workflow_job_url`, `workflow_status`, `conflict_types`, `metadata_updated_at`, `company_id`, `cached_title_episode_number`, `metadata_status`, `publication_status`, `publication_status_updated_at`, `metadata_status_updated_at`, `external_id`, `disabled_at`, `scheduling_type`, `overridden_rule_attributes`, `update_in_progress`, `metadata_error_digest`) VALUES/g' $DUMP_PATH/schedulings-insert-tb.sql + +echo "Truncate schedulings table" +echo "truncate $TB_DATABASE.$SCHEDULINGS_TABLE" | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn + +echo "Loading schedulings into CH" +cat $DUMP_PATH/schedulings-insert-tb.sql | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn +echo "Schedulings loaded" + +echo "Process finished!" 
\ No newline at end of file diff --git a/run-listeners.sh b/init/run-listeners.sh similarity index 100% rename from run-listeners.sh rename to init/run-listeners.sh diff --git a/stop-listeners.sh b/init/stop-listeners.sh similarity index 100% rename from stop-listeners.sh rename to init/stop-listeners.sh diff --git a/tb_tables.config b/init/tb_tables.config similarity index 100% rename from tb_tables.config rename to init/tb_tables.config From a2f19a4b64cd26d2658b3ed887604062f856d52f Mon Sep 17 00:00:00 2001 From: ygnuss Date: Tue, 16 Mar 2021 09:48:04 +0100 Subject: [PATCH 42/67] Updated gitignore to ignore out log files and binlog checkpoint files --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index eb18180..c44086a 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,7 @@ _build # Pyenv .python-version + +# Tinibird +bl-* +out-* \ No newline at end of file From d7bdeee7c9aad36936dadf8bd65999a6db5e8da6 Mon Sep 17 00:00:00 2001 From: ygnuss Date: Tue, 16 Mar 2021 09:49:05 +0100 Subject: [PATCH 43/67] Fix to handle multivaluated keys in asset table. This needs to be fixed to be generic. I've added this just to continue with the PoC --- clickhouse_mysql/writer/chwriter.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/clickhouse_mysql/writer/chwriter.py b/clickhouse_mysql/writer/chwriter.py index fbe9613..ab29ec4 100644 --- a/clickhouse_mysql/writer/chwriter.py +++ b/clickhouse_mysql/writer/chwriter.py @@ -253,7 +253,12 @@ def update(self, event_or_events): continue # for event event_converted = self.convert(event) - pk = event_converted.pymysqlreplication_event.primary_key + pk = [event_converted.pymysqlreplication_event.primary_key] + if event_converted.table == 'assets': + pk.append('name') + pk.append('title_id') + pk.append('company_id') + pk.append('asset_type_enumeration_entry_id') for row in event_converted.pymysqlreplication_event.rows: for key in row['after_values'].keys(): # we need to convert Decimal value to str value for suitable for table structure @@ -284,10 +289,10 @@ def update(self, event_or_events): sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {2}, `tb_upd`={3} where {4}'.format( schema, table, - ', '.join(filter(None, map(lambda column, value: "" if column == pk or value is None else self.get_data_format(column, value), row['after_values'].keys(), row['after_values'].values()))), + ', '.join(filter(None, map(lambda column, value: "" if column in pk or value is None else self.get_data_format(column, value), row['after_values'].keys(), row['after_values'].values()))), "'%s'" % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), ' and '.join(filter(None, map( - lambda column, value: "" if column != pk or value is None else self.get_data_format(column, value), + lambda column, value: "" if column not in pk or value is None else self.get_data_format(column, value), row['before_values'].keys(), row['before_values'].values()))) ) From 22cca2aa63936dac9c845726d717cb5e71a58ec6 Mon Sep 17 00:00:00 2001 From: ygnuss Date: Tue, 16 Mar 2021 09:51:45 +0100 Subject: [PATCH 44/67] Added first-processing script to initialize database without loosing data --- init/first-processing.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 init/first-processing.sh diff --git a/init/first-processing.sh b/init/first-processing.sh new file mode 100755 index 0000000..7daa44c --- /dev/null +++ b/init/first-processing.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + 
exit -1 +fi + +echo "Generate binlog timelog" +./run-listener.sh +./stop-listeners.sh + +echo "Generating dumps and loading data ..." +./dump-tables.sh $1 + +echo "Starting listeners" +./run-listener.sh + +echo "Done!" \ No newline at end of file From 9d67f60ba7617a7ede4ad63063d63f087ea878ad Mon Sep 17 00:00:00 2001 From: ygnuss Date: Mon, 12 Apr 2021 10:57:56 +0200 Subject: [PATCH 45/67] fix: Updated run script to be more flexible and support running just one process or all --- init/run-listeners.sh | 105 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 92 insertions(+), 13 deletions(-) diff --git a/init/run-listeners.sh b/init/run-listeners.sh index 60a5996..21a7502 100755 --- a/init/run-listeners.sh +++ b/init/run-listeners.sh @@ -24,28 +24,107 @@ source tb_tables.config # $7 --> Binlog position file # ############################################################# -run_listener() { +function run_listener() { (clickhouse-mysql --src-server-id=$5 --src-wait --src-resume --binlog-position-file $7 --nice-pause=1 --src-host=$SOURCE_HOST --src-port=$SOURCE_PORT --src-user=$SOURCE_USER --src-password=$SOURCE_PASSWD --src-schemas=$1 --src-tables=$2 --dst-host=$DESTINATION_HOST --dst-schema=$3 --dst-table=$4 --log-level=$LOG_LEVEL --pump-data 2>> $6)& } -run_listener "movida_preproduction" "schedulings" "$TB_DATABASE" "$SCHEDULINGS_TABLE" "91" "out-schedulings.log" "bl-pos-schedulings" -echo $! > $PID_LOG_FILE +function run_schedulings() { + if [ $binlog == "true" ]; then + rm "bl-pos-collections" + fi -run_listener "movida_preproduction" "platforms" "$TB_DATABASE" "$PLATFORMS_TABLE" "92" "out-platforms.log" "bl-pos-platforms" -echo $! >> $PID_LOG_FILE + run_listener "movida_preproduction" "schedulings" "$TB_DATABASE" "$SCHEDULINGS_TABLE" "91" "out-schedulings.log" "bl-pos-schedulings" + echo $! > $PID_LOG_FILE -run_listener "movida_preproduction" "titles" "$TB_DATABASE" "$TITLES_TABLE" "93" "out-titles.log" "bl-pos-titles" -echo $! >> $PID_LOG_FILE +} + +function run_platforms() { + if [ $binlog == "true" ]; then + rm "bl-pos-collections" + fi + + run_listener "movida_preproduction" "platforms" "$TB_DATABASE" "$PLATFORMS_TABLE" "92" "out-platforms.log" "bl-pos-platforms" + echo $! >> $PID_LOG_FILE + +} + +function run_titles() { + if [ $binlog == "true" ]; then + rm "bl-pos-collections" + fi + + run_listener "movida_preproduction" "titles" "$TB_DATABASE" "$TITLES_TABLE" "93" "out-titles.log" "bl-pos-titles" + echo $! >> $PID_LOG_FILE +} + +function run_assets() { + if [ $binlog == "true" ]; then + rm "bl-pos-collections" + fi -run_listener "movida_preproduction" "assets" "$TB_DATABASE" "$ASSETS_TABLE" "94" "out-assets.log" "bl-pos-assets" -echo $! >> $PID_LOG_FILE + run_listener "movida_preproduction" "assets" "$TB_DATABASE" "$ASSETS_TABLE" "94" "out-assets.log" "bl-pos-assets" + echo $! >> $PID_LOG_FILE +} + +function run_features() { + if [ $binlog == "true" ]; then + rm "bl-pos-collections" + fi + + run_listener "movida_preproduction" "features" "$TB_DATABASE" "$FEATURES_TABLE" "95" "out-features.log" "bl-pos-features" + echo $! >> $PID_LOG_FILE +} + +function run_collections() { + if [ $binlog == "true" ]; then + rm "bl-pos-collections" + fi + + run_listener "movida_preproduction" "collection_entries" "$TB_DATABASE" "$COLLECTIONS_TABLE" "96" "out-collections.log" "bl-pos-collections" + echo $! >> $PID_LOG_FILE +} + +function usage { + echo "usage: $0 -d datasource [-b clean_binlog]" + echo " -d datasource datasource to syn. Use all for synchronizing all available datasources." 
+ echo " - all" + echo " - schedulings" + echo " - platforms" + echo " - titles" + echo " - assets" + echo " - features" + echo " - collections" + echo " -b clean_binlog clean binlog before running (true | false) False by default" + exit -1 +} -run_listener "movida_preproduction" "features" "$TB_DATABASE" "$FEATURES_TABLE" "95" "out-features.log" "bl-pos-features" -echo $! >> $PID_LOG_FILE +datasource="NONE" +while getopts d:b: flag +do + case "${flag}" in + d) datasource=${OPTARG};; + b) binlog=${OPTARG};; + esac +done -run_listener "movida_preproduction" "collection_entries" "$TB_DATABASE" "$COLLECTIONS_TABLE" "96" "out-collections.log" "bl-pos-collections" -echo $! >> $PID_LOG_FILE +case "${datasource}" in + NONE) usage;; + all) run_schedulings binlog + run_platforms binlog + run_titles binlog + run_assets binlog + run_features binlog + run_collections binlog + ;; + schedulings) run_schedulings binlog;; + platforms) run_platforms binlog;; + titles) run_titles binlog;; + assets) run_assets binlog;; + features) run_features binlog;; + collections) run_collections binlog;; + *) usage;; +esac echo "PID processes in $PID_LOG_FILE" \ No newline at end of file From f58e6b88ce3e9ff24d69f31f26e15b8c8c20113e Mon Sep 17 00:00:00 2001 From: ygnuss Date: Thu, 20 May 2021 13:30:01 +0200 Subject: [PATCH 46/67] Changed update to include in alter table command just those columns which actually change --- clickhouse_mysql/writer/chwriter.py | 18 ++++- requirements.txt | 112 ++-------------------------- 2 files changed, 21 insertions(+), 109 deletions(-) diff --git a/clickhouse_mysql/writer/chwriter.py b/clickhouse_mysql/writer/chwriter.py index ab29ec4..6cca8ef 100644 --- a/clickhouse_mysql/writer/chwriter.py +++ b/clickhouse_mysql/writer/chwriter.py @@ -283,19 +283,33 @@ def update(self, event_or_events): self.dst_table)) # and UPDATE converted rows - + # improve performance updating just those fields which have actually changed + updated_values = dict(set(row['after_values'].items()).difference(set(row['before_values'].items()))) + sql = '' try: + # sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {2}, `tb_upd`={3} where {4}'.format( + # schema, + # table, + # ', '.join(filter(None, map(lambda column, value: "" if column in pk or value is None else self.get_data_format(column, value), row['after_values'].keys(), row['after_values'].values()))), + # "'%s'" % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + # ' and '.join(filter(None, map( + # lambda column, value: "" if column not in pk or value is None else self.get_data_format(column, value), + # row['before_values'].keys(), row['before_values'].values()))) + # ) + sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {2}, `tb_upd`={3} where {4}'.format( schema, table, - ', '.join(filter(None, map(lambda column, value: "" if column in pk or value is None else self.get_data_format(column, value), row['after_values'].keys(), row['after_values'].values()))), + ', '.join(filter(None, map(lambda column, value: "" if column in pk or value is None else self.get_data_format(column, value), updated_values.keys(), updated_values.values()))), "'%s'" % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), ' and '.join(filter(None, map( lambda column, value: "" if column not in pk or value is None else self.get_data_format(column, value), row['before_values'].keys(), row['before_values'].values()))) ) + logging.debug("SQL UPDATE: \n\n " + sql + "\n\n") + self.client.execute(sql) except Exception as ex: logging.critical('QUERY FAILED') diff --git a/requirements.txt 
b/requirements.txt
index bce8d28..da4173a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,108 +1,6 @@
-appdirs==1.4.4
-apturl==0.5.2
-astroid==2.4.2
-basictracer==3.1.0
-blinker==1.4
-Brlapi==0.7.0
-cachetools==4.0.0
-certifi==2020.12.5
-chardet==3.0.4
-chrome-gnome-shell==0.0.0
-Click==7.0
-clickhouse-driver==0.0.10
-clickhouse-toolset @ file:///tmp/clickhouse_toolset-0.9.dev0-cp38-cp38-linux_x86_64.whl
-colorama==0.4.3
-command-not-found==0.3
-configobj==5.0.6
-crcmod==1.7
-cryptography==3.0
-cupshelpers==1.0
-datasketch==1.2.10
-dbus-python==1.2.16
-defer==1.0.6
-distlib==0.3.1
-distro==1.5.0
-distro-info===0.23ubuntu1
-dlib==19.16.0
-filelock==3.0.12
-httplib2==0.18.1
-humanfriendly==8.2
-idna==2.6
-importlib-metadata==1.6.0
-isort==5.7.0
-jeepney==0.4.3
-keyring==21.3.0
-language-selector==0.1
-launchpadlib==1.10.13
-lazr.restfulclient==0.14.2
-lazr.uri==1.0.5
-lazy-object-proxy==1.4.3
-louis==3.14.0
-macaroonbakery==1.3.1
-Markdown==3.2.1
-mccabe==0.6.1
-more-itertools==4.2.0
-msal==1.5.0
-netifaces==0.10.4
-numpy==1.15.0
-oauthlib==3.1.0
-olefile==0.46
-opencv-python==4.4.0.46
-opentracing==2.0.0
-passlib==1.7.1
-pexpect==4.6.0
-Pillow==7.2.0
-powerline-status==2.8.1
-protobuf==3.12.3
-psutil==5.6.3
-psycopg2-binary==2.8.5
-pycairo==1.16.2
-pycups==2.0.1
-pycurl==7.43.0.6
-Pygments==2.3.1
-PyGObject==3.38.0
-PyJWT==1.6.4
-pylint==2.6.0
-pymacaroons==0.13.0
+clickhouse-driver==0.2.0
+mysql-replication==0.23
+mysqlclient==2.0.3
 PyMySQL==1.0.2
-PyNaCl==1.4.0
-pyRFC3339==1.1
-python-apt==2.1.3+ubuntu1.3
-python-dateutil==2.8.1
-python-debian==0.1.37
-pytz==2020.1
-pyxdg==0.26
-PyYAML==5.3.1
-rangehttpserver==1.2.0
-redis==3.2.1
-reportlab==3.5.47
-requests==2.18.4
-requests-toolbelt==0.9.1
-requests-unixsocket==0.2.0
-screen-resolution-extra==0.0.0
-SecretStorage==3.1.2
-simplejson==3.17.0
-six==1.15.0
-streaming-form-data==1.1.0
-systemd-python==234
-tabulate==0.8.3
-terminator==1.92
--e git+git@gitlab.com:tinybird/analytics.git@0d13783b7e38c0decc97ac06901e8ce7b804221e#egg=tinybird
-tinybird-cli==1.0.0b12
-TLPUI==1.3.1.post3
-toml==0.10.2
-toposort==1.5
-tornado==5.1.1
-tornado-opentracing==1.0.1
-torngithub==0.2.0
-ubuntu-advantage-tools==24.4
-ubuntu-drivers-common==0.0.0
-ufw==0.36
-unattended-upgrades==0.1
-urllib3==1.22
-vboxapi==1.0
-virtualenv==20.0.29+ds
-wadllib==1.3.4
-wrapt==1.12.1
-xkit==0.0.0
-zipp==1.0.0
+pytz==2021.1
+tzlocal==2.1

From 081b63294e35b50e15838ff012b9a893f567d7c6 Mon Sep 17 00:00:00 2001
From: Alejandro Del Amo
Date: Tue, 10 Aug 2021 18:59:00 +0200
Subject: [PATCH 47/67] Added support to insert/update and CSV

---
 .gitignore | 3 +-
 clickhouse_mysql/clioptions.py | 23 +++
 clickhouse_mysql/config.py | 13 +-
 clickhouse_mysql/event/event.py | 6 +-
 clickhouse_mysql/reader/mysqlreader.py | 4 +-
 clickhouse_mysql/writer/chcsvwriter.py | 222 ----------------------
 clickhouse_mysql/writer/chwriter.py | 144 ++++++++-------
 clickhouse_mysql/writer/csvwriter.py | 159 +++++++++++++++-
 clickhouse_mysql/writer/poolwriter.py | 6 +
 clickhouse_mysql/writer/processwriter.py | 8 +-
 clickhouse_mysql/writer/tbcsvwriter.py | 223 +++++++++++++++++++++++
 init/dump-tables.sh | 132 --------------
 init/first-processing.sh | 18 --
 init/run-listeners.sh | 130 -------------
 init/stop-listeners.sh | 23 ---
 init/tb_tables.config | 9 -
 notes.txt | 3 +
 17 files changed, 507 insertions(+), 619 deletions(-)
 delete mode 100644 clickhouse_mysql/writer/chcsvwriter.py
 create mode 100644 clickhouse_mysql/writer/tbcsvwriter.py
 delete mode 100755
init/first-processing.sh delete mode 100755 init/run-listeners.sh delete mode 100755 init/stop-listeners.sh delete mode 100644 init/tb_tables.config create mode 100644 notes.txt diff --git a/.gitignore b/.gitignore index c44086a..1b8fd9a 100644 --- a/.gitignore +++ b/.gitignore @@ -46,4 +46,5 @@ _build # Tinibird bl-* -out-* \ No newline at end of file +out-* +.e diff --git a/clickhouse_mysql/clioptions.py b/clickhouse_mysql/clioptions.py index 4be23a2..c46897e 100644 --- a/clickhouse_mysql/clioptions.py +++ b/clickhouse_mysql/clioptions.py @@ -93,6 +93,10 @@ class CLIOptions(Options): # # general app section # + + 'tb_host': 'https://ui.tinybird.co', + 'tb_token': None, + 'config_file': '/etc/clickhouse-mysql/clickhouse-mysql.conf', 'log_file': None, 'log_level': None, @@ -171,6 +175,20 @@ def options(self): # # general app section # + argparser.add_argument( + '--tb-host', + type=str, + default=self.default_options['tb_host'], + help='Tinybird host' + ) + + argparser.add_argument( + '--tb-token', + type=str, + default=self.default_options['tb_token'], + help='Tinybird host' + ) + argparser.add_argument( '--config-file', type=str, @@ -508,6 +526,11 @@ def options(self): # # general app section # + + 'tb_host': args.tb_host, + 'tb_token': args.tb_token, + + 'config_file': args.config_file, 'log_file': args.log_file, 'log_level': args.log_level, diff --git a/clickhouse_mysql/config.py b/clickhouse_mysql/config.py index e4551c8..1fc9030 100644 --- a/clickhouse_mysql/config.py +++ b/clickhouse_mysql/config.py @@ -6,7 +6,7 @@ from clickhouse_mysql.writer.chwriter import CHWriter from clickhouse_mysql.writer.csvwriter import CSVWriter -from clickhouse_mysql.writer.chcsvwriter import CHCSVWriter +from clickhouse_mysql.writer.tbcsvwriter import TBCSVWriter from clickhouse_mysql.writer.poolwriter import PoolWriter from clickhouse_mysql.writer.processwriter import ProcessWriter from clickhouse_mysql.objectbuilder import ObjectBuilder @@ -61,6 +61,10 @@ def __init__(self): # # # + 'tinybird': { + 'host': self.options['tb_host'], + 'token': self.options['tb_token'], + }, 'app': { 'config_file': self.options['config_file'], 'log_file': self.options['log_file'], @@ -359,8 +363,11 @@ def writer_builder_csvpool(self): 'dst_table': self.config['writer']['file']['dst_table'], 'dst_table_prefix': self.config['writer']['file']['dst_table_prefix'], 'next_writer_builder': ObjectBuilder( - class_name=CHCSVWriter, - constructor_params=self.config['writer']['clickhouse'] + class_name=TBCSVWriter, + constructor_params={ + 'tb_host': self.config['tinybird']['host'], + 'tb_token': self.config['tinybird']['token'] + } ), 'converter_builder': self.converter_builder(CONVERTER_CSV), }) diff --git a/clickhouse_mysql/event/event.py b/clickhouse_mysql/event/event.py index e018e57..e38f80b 100644 --- a/clickhouse_mysql/event/event.py +++ b/clickhouse_mysql/event/event.py @@ -64,7 +64,11 @@ def __next__(self): if self.pymysqlreplication_event is not None: # in native replication event actual data are in row['values'] dict item - return item['values'] + if 'after_values' in item: + return item['after_values'] + else: + return item['values'] + else: # local-kept data return item diff --git a/clickhouse_mysql/reader/mysqlreader.py b/clickhouse_mysql/reader/mysqlreader.py index 659ab77..040bac0 100644 --- a/clickhouse_mysql/reader/mysqlreader.py +++ b/clickhouse_mysql/reader/mysqlreader.py @@ -316,7 +316,7 @@ def process_update_rows_event(self, mysql_event): return # statistics - 
#self.stat_write_rows_event_calc_rows_num_min_max(rows_num_per_event=len(mysql_event.rows)) + self.stat_write_rows_event_calc_rows_num_min_max(rows_num_per_event=len(mysql_event.rows)) if self.subscribers('UpdateRowsEvent'): # dispatch event to subscribers @@ -330,7 +330,7 @@ def process_update_rows_event(self, mysql_event): event.table = mysql_event.table event.pymysqlreplication_event = mysql_event - #self.process_first_event(event=event) + self.process_first_event(event=event) self.notify('UpdateRowsEvent', event=event) # self.stat_write_rows_event_finalyse() diff --git a/clickhouse_mysql/writer/chcsvwriter.py b/clickhouse_mysql/writer/chcsvwriter.py deleted file mode 100644 index 88571c3..0000000 --- a/clickhouse_mysql/writer/chcsvwriter.py +++ /dev/null @@ -1,222 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import os -import logging -import shlex - -from clickhouse_mysql.writer.writer import Writer -from clickhouse_mysql.tableprocessor import TableProcessor - - -class CHCSVWriter(Writer): - """Write into ClickHouse via CSV file and clickhouse-client tool""" - - dst_schema = None - dst_table = None - dst_distribute = None - - host = None - port = None - user = None - password = None - - def __init__( - self, - connection_settings, - dst_schema=None, - dst_table=None, - dst_table_prefix=None, - dst_distribute=False, - ): - if dst_distribute and dst_schema is not None: - dst_schema += "_all" - if dst_distribute and dst_table is not None: - dst_table += "_all" - logging.info( - "CHCSWriter() connection_settings={} dst_schema={} dst_table={}".format(connection_settings, dst_schema, - dst_table)) - self.host = connection_settings['host'] - self.port = connection_settings['port'] - self.user = connection_settings['user'] - self.password = connection_settings['password'] - self.dst_schema = dst_schema - self.dst_table = dst_table - self.dst_table_prefix = dst_table_prefix - self.dst_distribute = dst_distribute - - def insert(self, event_or_events=None): - # event_or_events = [ - # event: { - # row: {'id': 3, 'a': 3} - # }, - # event: { - # row: {'id': 3, 'a': 3} - # }, - # ] - - events = self.listify(event_or_events) - if len(events) < 1: - logging.warning('No events to insert. 
class: %s', __class__) - return - - # assume we have at least one Event - - logging.debug('class:%s insert %d rows', __class__, len(events)) - - for event in events: - schema = self.dst_schema if self.dst_schema else event.schema - table = None - if self.dst_distribute: - table = TableProcessor.create_distributed_table_name(db=event.schema, table=event.table) - else: - table = self.dst_table if self.dst_table else event.table - if self.dst_schema: - table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) - - sql = 'INSERT INTO `{0}`.`{1}` ({2}) FORMAT CSV'.format( - schema, - table, - ', '.join(map(lambda column: '`%s`' % column, event.fieldnames)), - ) - - choptions = "" - if self.host: - choptions += " --host=" + shlex.quote(self.host) - if self.port: - choptions += " --port=" + str(self.port) - if self.user: - choptions += " --user=" + shlex.quote(self.user) - if self.password: - choptions += " --password=" + shlex.quote(self.password) - bash = "tail -n +2 '{0}' | clickhouse-client {1} --query='{2}'".format( - event.filename, - choptions, - sql, - ) - - logging.info('starting clickhouse-client process') - logging.debug('starting %s', bash) - os.system(bash) - - pass - - def deleteRow(self, event_or_events=None): - # event_or_events = [ - # event: { - # row: {'id': 3, 'a': 3} - # }, - # event: { - # row: {'id': 3, 'a': 3} - # }, - # ] - - events = self.listify(event_or_events) - if len(events) < 1: - logging.warning('No events to delete. class: %s', __class__) - return - - # assume we have at least one Event - - logging.debug('class:%s delete %d rows', __class__, len(events)) - - for event in events: - schema = self.dst_schema if self.dst_schema else event.schema - table = None - if self.dst_distribute: - table = TableProcessor.create_distributed_table_name(db=event.schema, table=event.table) - else: - table = self.dst_table if self.dst_table else event.table - if self.dst_schema: - table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) - - sql = 'ALTER TABLE `{0}`.`{1}` DELETE WHERE {2} = {3} '.format( - schema, - table, - ' AND '.join(map(lambda column: '`%s`' % column, event.fieldnames)), - ) - - choptions = "" - if self.host: - choptions += " --host=" + shlex.quote(self.host) - if self.port: - choptions += " --port=" + str(self.port) - if self.user: - choptions += " --user=" + shlex.quote(self.user) - if self.password: - choptions += " --password=" + shlex.quote(self.password) - bash = "tail -n +2 '{0}' | clickhouse-client {1} --query='{2}'".format( - event.filename, - choptions, - sql, - ) - - logging.info('starting clickhouse-client process for delete operation') - logging.debug('starting %s', bash) - os.system(bash) - - pass - - def update(self, event_or_events=None): - # event_or_events = [ - # event: { - # row: {'id': 3, 'a': 3} - # }, - # event: { - # row: {'id': 3, 'a': 3} - # }, - # ] - - logging.info('starting clickhouse-client process for update operation') - - events = self.listify(event_or_events) - if len(events) < 1: - logging.warning('No events to update. 
class: %s', __class__) - return - - # assume we have at least one Event - - logging.debug('class:%s update %d rows', __class__, len(events)) - - for event in events: - schema = self.dst_schema if self.dst_schema else event.schema - table = None - if self.dst_distribute: - table = TableProcessor.create_distributed_table_name(db=event.schema, table=event.table) - else: - table = self.dst_table if self.dst_table else event.table - if self.dst_schema: - table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) - - sql = 'INSERT INTO `{0}`.`{1}` ({2}) FORMAT CSV'.format( - schema, - table, - ', '.join(map(lambda column: '`%s`' % column, event.fieldnames)), - ) - - sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {3}'.format( - schema, - table, - ', '.join(map(lambda column, value: '`%s`=`%s' % column, event.fieldnames, event.fieldnames)) - ) - - choptions = "" - if self.host: - choptions += " --host=" + shlex.quote(self.host) - if self.port: - choptions += " --port=" + str(self.port) - if self.user: - choptions += " --user=" + shlex.quote(self.user) - if self.password: - choptions += " --password=" + shlex.quote(self.password) - bash = "tail -n +2 '{0}' | clickhouse-client {1} --query='{2}'".format( - event.filename, - choptions, - sql, - ) - - logging.info('starting clickhouse-client process') - logging.debug('starting %s', bash) - os.system(bash) - - pass diff --git a/clickhouse_mysql/writer/chwriter.py b/clickhouse_mysql/writer/chwriter.py index 6cca8ef..c43ec42 100644 --- a/clickhouse_mysql/writer/chwriter.py +++ b/clickhouse_mysql/writer/chwriter.py @@ -68,15 +68,25 @@ def insert(self, event_or_events=None): event_converted = None for event in events: if not event.verify: - logging.warning('Event verification failed. Skip one event. Event: %s Class: %s', event.meta(), __class__) - continue # for event + logging.warning( + 'Event verification failed. Skip one event. 
Event: %s Class: %s', event.meta(), __class__) + continue # for event event_converted = self.convert(event) for row in event_converted: + # These columns are added to identify the last change (tb_upd) and the kind of operation performed + # 0 - INSERT, 1 - UPDATE, 2 - DELETE + row['tb_upd'] = datetime.datetime.now() + row['operation'] = 0 + for key in row.keys(): - # we need to convert Decimal value to str value for suitable for table structure - if type(row[key]) == Decimal: + # we need to convert Decimal or timedelta value to str value for suitable for table structure + if type(row[key]) == [Decimal, datetime.timedelta]: row[key] = str(row[key]) + + # These columns are added to identify the last change (tb_upd) and when a row is deleted (1) + # row['tb_upd'] = datetime.datetime.now() + # row['operation'] = 0 rows.append(row) logging.debug('class:%s insert %d row(s)', __class__, len(rows)) @@ -86,13 +96,16 @@ def insert(self, event_or_events=None): schema = self.dst_schema if self.dst_schema else event_converted.schema table = None if self.dst_distribute: - table = TableProcessor.create_distributed_table_name(db=event_converted.schema, table=event_converted.table) + table = TableProcessor.create_distributed_table_name( + db=event_converted.schema, table=event_converted.table) else: table = self.dst_table if self.dst_table else event_converted.table if self.dst_schema: - table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) + table = TableProcessor.create_migrated_table_name( + prefix=self.dst_table_prefix, table=table) - logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format(schema, table, self.dst_schema, self.dst_table)) + logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format( + schema, table, self.dst_schema, self.dst_table)) # and INSERT converted rows @@ -103,6 +116,7 @@ def insert(self, event_or_events=None): table, ', '.join(map(lambda column: '`%s`' % column, rows[0].keys())) ) + logging.debug(f"CHWRITER QUERY INSERT: {sql}") self.client.execute(sql, rows) except Exception as ex: logging.critical('QUERY FAILED') @@ -138,7 +152,6 @@ def delete_row(self, event_or_events): rows = [] event_converted = None - pk = None for event in events: if not event.verify: logging.warning('Event verification failed. Skip one event. 
Event: %s Class: %s', event.meta(), @@ -146,12 +159,20 @@ def delete_row(self, event_or_events): continue # for event event_converted = self.convert(event) - pk = event_converted.pymysqlreplication_event.primary_key for row in event_converted: + # These columns are added to identify the last change (tb_upd) and the kind of operation performed + # 0 - INSERT, 1 - UPDATE, 2 - DELETE + row['tb_upd'] = datetime.datetime.now() + row['operation'] = 2 + for key in row.keys(): - # we need to convert Decimal value to str value for suitable for table structure - if type(row[key]) == Decimal: + # we need to convert Decimal or timedelta value to str value for suitable for table structure + if type(row[key]) in [Decimal, datetime.timedelta]: row[key] = str(row[key]) + + # These columns are added to identify the last change (tb_upd) and when a row is deleted (1) + # row['tb_upd'] = datetime.datetime.now() + # row['operation'] = 2 rows.append(row) logging.debug('class:%s delete %d row(s)', __class__, len(rows)) @@ -161,37 +182,45 @@ def delete_row(self, event_or_events): schema = self.dst_schema if self.dst_schema else event_converted.schema table = None if self.dst_distribute: - table = TableProcessor.create_distributed_table_name(db=event_converted.schema, table=event_converted.table) + table = TableProcessor.create_distributed_table_name( + db=event_converted.schema, table=event_converted.table) else: table = self.dst_table if self.dst_table else event_converted.table if self.dst_schema: - table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) + table = TableProcessor.create_migrated_table_name( + prefix=self.dst_table_prefix, table=table) logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format(schema, table, self.dst_schema, self.dst_table)) # and DELETE converted rows - sql = '' - # try: - # sql = 'ALTER TABLE `{0}`.`{1}` DELETE WHERE {2} = {3} '.format( - # schema, - # table, - # ' AND '.join(map(lambda column: '`%s`' % column, event.fieldnames)), - # ) - # self.client.execute(sql, rows) + # These columns are added to identify the last change (tb_upd) and the kind of operation performed + # 0 - INSERT, 1 - UPDATE, 2 - DELETE + rows[0]['tb_upd'] = datetime.datetime.now() + rows[0]['operation'] = 2 sql = '' try: - sql = 'ALTER TABLE `{0}`.`{1}` DELETE WHERE {2}'.format( + sql = 'INSERT INTO `{0}`.`{1}` ({2}) VALUES'.format( schema, table, - ' and '.join(filter(None, map( - lambda column, value: "" if column != pk else self.get_data_format(column, value), - row.keys(), row.values()))) + ', '.join(map(lambda column: '`%s`' % column, rows[0].keys())) ) + logging.debug(f"CHWRITER QUERY DELETE: {sql}") + self.client.execute(sql, rows) - self.client.execute(sql) + # sql = '' + # try: + # sql = 'ALTER TABLE `{0}`.`{1}` DELETE WHERE {2}'.format( + # schema, + # table, + # ' and '.join(filter(None, map( + # lambda column, value: "" if column != pk else self.get_data_format(column, value), + # row.keys(), row.values()))) + # ) + # + # self.client.execute(sql) except Exception as ex: logging.critical('QUERY FAILED') @@ -204,6 +233,7 @@ def delete_row(self, event_or_events): """ Get string format pattern for update and delete operations """ + def get_data_format(self, column, value): t = type(value) if t == str: @@ -245,7 +275,6 @@ def update(self, event_or_events): rows = [] event_converted = None - pk = None for event in events: if not event.verify: logging.warning('Event verification failed. Skip one event. 
Event: %s Class: %s', event.meta(), @@ -253,18 +282,18 @@ def update(self, event_or_events): continue # for event event_converted = self.convert(event) - pk = [event_converted.pymysqlreplication_event.primary_key] - if event_converted.table == 'assets': - pk.append('name') - pk.append('title_id') - pk.append('company_id') - pk.append('asset_type_enumeration_entry_id') for row in event_converted.pymysqlreplication_event.rows: + for key in row['after_values'].keys(): - # we need to convert Decimal value to str value for suitable for table structure - if type(row['after_values'][key]) == Decimal: - row['after_values'][key] = str(row['after_values'][key]) - rows.append(row) + # we need to convert Decimal or timedelta value to str value for suitable for table structure + if type(row['after_values'][key]) in [Decimal, datetime.timedelta]: + row['after_values'][key] = str( + row['after_values'][key]) + + # These columns are added to identify the last change (tb_upd) and when a row is deleted (1) + row['after_values']['tb_upd'] = datetime.datetime.now() + row['after_values']['operation'] = 1 + rows.append(row['after_values']) logging.debug('class:%s update %d row(s)', __class__, len(rows)) @@ -273,55 +302,42 @@ def update(self, event_or_events): schema = self.dst_schema if self.dst_schema else event_converted.schema table = None if self.dst_distribute: - table = TableProcessor.create_distributed_table_name(db=event_converted.schema, table=event_converted.table) + table = TableProcessor.create_distributed_table_name( + db=event_converted.schema, table=event_converted.table) else: table = self.dst_table if self.dst_table else event_converted.table if self.dst_schema: - table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) + table = TableProcessor.create_migrated_table_name( + prefix=self.dst_table_prefix, table=table) logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format(schema, table, self.dst_schema, self.dst_table)) # and UPDATE converted rows - # improve performance updating just those fields which have actually changed - updated_values = dict(set(row['after_values'].items()).difference(set(row['before_values'].items()))) - + + # These columns are added to identify the last change (tb_upd) and when a row is deleted (1) + rows[0]['tb_upd'] = datetime.datetime.now() + rows[0]['operation'] = 1 + sql = '' try: - # sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {2}, `tb_upd`={3} where {4}'.format( - # schema, - # table, - # ', '.join(filter(None, map(lambda column, value: "" if column in pk or value is None else self.get_data_format(column, value), row['after_values'].keys(), row['after_values'].values()))), - # "'%s'" % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - # ' and '.join(filter(None, map( - # lambda column, value: "" if column not in pk or value is None else self.get_data_format(column, value), - # row['before_values'].keys(), row['before_values'].values()))) - # ) - - sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {2}, `tb_upd`={3} where {4}'.format( + sql = 'INSERT INTO `{0}`.`{1}` ({2}) VALUES'.format( schema, table, - ', '.join(filter(None, map(lambda column, value: "" if column in pk or value is None else self.get_data_format(column, value), updated_values.keys(), updated_values.values()))), - "'%s'" % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - ' and '.join(filter(None, map( - lambda column, value: "" if column not in pk or value is None else self.get_data_format(column, value), - row['before_values'].keys(), 
row['before_values'].values()))) + ', '.join(map(lambda column: '`%s`' % column, rows[0].keys())) ) - - logging.debug("SQL UPDATE: \n\n " + sql + "\n\n") - - self.client.execute(sql) + logging.debug(f"CHWRITER QUERY UPDATE: {sql}") + self.client.execute(sql, rows) except Exception as ex: logging.critical('QUERY FAILED') logging.critical('ex={}'.format(ex)) logging.critical('sql={}'.format(sql)) + logging.critical('data={}'.format(rows)) # sys.exit(0) # all DONE - - if __name__ == '__main__': connection_settings = { 'host': '192.168.74.230', diff --git a/clickhouse_mysql/writer/csvwriter.py b/clickhouse_mysql/writer/csvwriter.py index 18cfda6..00c6eaf 100644 --- a/clickhouse_mysql/writer/csvwriter.py +++ b/clickhouse_mysql/writer/csvwriter.py @@ -11,6 +11,10 @@ from clickhouse_mysql.writer.writer import Writer from clickhouse_mysql.event.event import Event +import datetime + +from pymysqlreplication.row_event import WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent + class CSVWriter(Writer): """Write CSV files""" @@ -89,6 +93,7 @@ def open(self): # open file for write-at-the-end mode self.file = open(self.path, 'a+') + def insert(self, event_or_events): # event_or_events = [ # event: { @@ -118,7 +123,14 @@ def insert(self, event_or_events): logging.warning('Event verification failed. Skip insert(). Event: %s Class: %s', event.meta(), __class__) return - self.fieldnames = sorted(self.convert(copy.copy(event.first_row())).keys()) + event_converted = self.convert(event) + rows = event_converted.pymysqlreplication_event.rows + headers = list(rows[0]['values'].keys()) + headers.append('operation') + headers.append('tb_upd') + + # self.fieldnames = sorted(self.convert(copy.copy(event.first_row())).keys()) + self.fieldnames = headers if self.dst_schema is None: self.dst_schema = event.schema if self.dst_table is None: @@ -132,21 +144,148 @@ def insert(self, event_or_events): if not event.verify: logging.warning('Event verification failed. Skip one event. Event: %s Class: %s', event.meta(), __class__) continue # for event - for row in event: - self.writer.writerow(self.convert(row)) + self.generate_row(event) + + def delete_row(self, event_or_events): + + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] - def deleteRow(self, event_or_events): - """ - TODO - """ logging.debug("Delete CSV Writer") + events = self.listify(event_or_events) + if len(events) < 1: + logging.warning('No events to delete. class: %s', __class__) + return + + # assume we have at least one Event + + logging.debug('class:%s delete %d events', __class__, len(events)) + + if not self.opened(): + self.open() + + if not self.writer: + # pick any event from the list + event = events[0] + if not event.verify: + logging.warning('Event verification failed. Skip insert(). Event: %s Class: %s', event.meta(), __class__) + return + + event_converted = self.convert(event) + rows = event_converted.pymysqlreplication_event.rows + headers = list(rows[0]['values'].keys()) + headers.append('operation') + headers.append('tb_upd') + + self.fieldnames = headers + if self.dst_schema is None: + self.dst_schema = event.schema + if self.dst_table is None: + self.dst_table = event.table + + self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames) + if not self.header_written: + self.writer.writeheader() + + for event in events: + if not event.verify: + logging.warning('Event verification failed. Skip one event. 
Event: %s Class: %s', event.meta(), __class__) + continue # for event + self.generate_row(event) + + + def update(self, event_or_events): - """ - TODO - """ + + # event_or_events = [ + # event: { + # row: { + # 'before_values': {'id': 3, 'a': 3}, + # 'after_values': {'id': 3, 'a': 2} + # } + # }, + # event: { + # row: { + # 'before_values': {'id': 2, 'a': 3}, + # 'after_values': {'id': 2, 'a': 2} + # } + # }, + # ] + logging.debug("Update CSV Writer") + events = self.listify(event_or_events) + if len(events) < 1: + logging.warning('No events to update. class: %s', __class__) + return + + # assume we have at least one Event + + logging.debug('class:%s updated %d events', __class__, len(events)) + + if not self.opened(): + self.open() + + if not self.writer: + # pick any event from the list + event = events[0] + if not event.verify: + logging.warning('Event verification failed. Skip insert(). Event: %s Class: %s', event.meta(), __class__) + return + + event_converted = self.convert(event) + rows = event_converted.pymysqlreplication_event.rows + headers = list(rows[0]['after_values'].keys()) + headers.append('operation') + headers.append('tb_upd') + + # self.fieldnames = sorted(headers) + self.fieldnames = headers + if self.dst_schema is None: + self.dst_schema = event.schema + if self.dst_table is None: + self.dst_table = event.table + + self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames) + if not self.header_written: + self.writer.writeheader() + + for event in events: + if not event.verify: + logging.warning('Event verification failed. Skip one event. Event: %s Class: %s', event.meta(), __class__) + continue # for event + + event_converted = self.convert(event) + self.generate_row(event_converted) + + + def generate_row(self, event): + """ When using mempool or csvpool events are cached so you can receive different kind of events in the same list. 
These events should be handled in a different way """ + + if isinstance(event.pymysqlreplication_event, WriteRowsEvent): + for row in event: + row['tb_upd'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + row['operation'] = 0 + self.writer.writerow(self.convert(row)) + elif isinstance(event.pymysqlreplication_event, DeleteRowsEvent): + for row in event: + row['tb_upd'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + row['operation'] = 2 + self.writer.writerow(self.convert(row)) + elif isinstance(event.pymysqlreplication_event, UpdateRowsEvent): + for row in event.pymysqlreplication_event.rows: + row['after_values']['tb_upd'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + row['after_values']['operation'] = 1 + self.writer.writerow(self.convert(row['after_values'])) + + def push(self): if not self.next_writer_builder or not self.fieldnames: return diff --git a/clickhouse_mysql/writer/poolwriter.py b/clickhouse_mysql/writer/poolwriter.py index 129f05a..071a6ef 100644 --- a/clickhouse_mysql/writer/poolwriter.py +++ b/clickhouse_mysql/writer/poolwriter.py @@ -38,11 +38,17 @@ def insert(self, event_or_events): self.pool.insert(event_or_events) + # TODO delete if delete_row works def delete(self, event_or_events): """Insert delete data into Pool""" logging.debug('class:%s delete', __class__) self.pool.insert(event_or_events) + def delete_row(self, event_or_events): + """Insert delete data into Pool""" + logging.debug('class:%s delete', __class__) + self.pool.insert(event_or_events) + def update(self, event_or_events): """Insert update data into Pool""" logging.debug('class:%s update', __class__) diff --git a/clickhouse_mysql/writer/processwriter.py b/clickhouse_mysql/writer/processwriter.py index 8177345..b3584f2 100644 --- a/clickhouse_mysql/writer/processwriter.py +++ b/clickhouse_mysql/writer/processwriter.py @@ -40,22 +40,22 @@ def processDelete(self, event_or_events=None): logging.debug('class:%s process()', __class__) writer = self.next_writer_builder.get() - writer.deleteRow(event_or_events) + writer.delete_row(event_or_events) writer.close() writer.push() writer.destroy() - logging.debug('class:%s process() done', __class__) + logging.debug('class:%s processDelete() done', __class__) def processUpdate(self, event_or_events=None): """Separate process body to be run""" logging.debug('class:%s process()', __class__) writer = self.next_writer_builder.get() - writer.delete(event_or_events) + writer.update(event_or_events) writer.close() writer.push() writer.destroy() - logging.debug('class:%s process() done', __class__) + logging.debug('class:%s processUpdate() done', __class__) def insert(self, event_or_events=None): # event_or_events = [ diff --git a/clickhouse_mysql/writer/tbcsvwriter.py b/clickhouse_mysql/writer/tbcsvwriter.py new file mode 100644 index 0000000..60482f0 --- /dev/null +++ b/clickhouse_mysql/writer/tbcsvwriter.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import logging +import shlex + +from clickhouse_mysql.writer.writer import Writer +from clickhouse_mysql.tableprocessor import TableProcessor + +import requests +from requests_toolbelt.multipart.encoder import MultipartEncoder +import json + +class TBCSVWriter(Writer): + """Write into Tinybird via CSV file""" + + dst_schema = None + dst_table = None + dst_distribute = None + + tb_host = None + tb_token = None + + def __init__( + self, + tb_host, + tb_token, + dst_schema=None, + dst_table=None, + dst_table_prefix=None, + dst_distribute=False, + ): + # if 
dst_distribute and dst_schema is not None: + # dst_schema += "_all" + # if dst_distribute and dst_table is not None: + # dst_table += "_all" + # logging.info( + # "CHCSWriter() connection_settings={} dst_schema={} dst_table={}".format(connection_settings, dst_schema, + # dst_table)) + self.tb_host = tb_host + self.tb_token = tb_token + + if self.tb_host is None or self.tb_token is None: + logging.critical(f" Host: {self.tb_host} or token {self.tb_token} is missing") + return None + + self.dst_schema = dst_schema + self.dst_table = dst_table + self.dst_table_prefix = dst_table_prefix + self.dst_distribute = dst_distribute + + def insert(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + events = self.listify(event_or_events) + if len(events) < 1: + logging.warning('No events to insert. class: %s', __class__) + return + + # assume we have at least one Event + + logging.debug('class:%s insert %d rows', __class__, len(events)) + + for event in events: + #schema = self.dst_schema if self.dst_schema else event.schema + #table = self.dst_table if self.dst_table else event.table + + params = { + 'name': self.dst_table, + 'mode': 'append' + } + + f = open(event.filename, 'rb') + m = MultipartEncoder(fields={'csv': ('csv', f, 'text/csv')}) + + url = f"{self.tb_host}/v0/datasources" + + response = requests.post(url, data=m, + headers={'Authorization': 'Bearer ' + self.tb_token, 'Content-Type': m.content_type}, + params=params + ) + + # logging.debug(response.text) + if response.status_code == 200: + json_object = json.loads(response.content) + logging.debug(f"Import id: {json_object['import_id']}") + # logging.debug(f"Response: {json.dumps(json_object, indent=2)}") + + else: + logging.debug(f"ERROR {response.text}") + + pass + + def deleteRow(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + # events = self.listify(event_or_events) + # if len(events) < 1: + # logging.warning('No events to delete. 
class: %s', __class__) + # return + + # # assume we have at least one Event + + # logging.debug('class:%s delete %d rows', __class__, len(events)) + + # for event in events: + # schema = self.dst_schema if self.dst_schema else event.schema + # table = None + # if self.dst_distribute: + # table = TableProcessor.create_distributed_table_name(db=event.schema, table=event.table) + # else: + # table = self.dst_table if self.dst_table else event.table + # if self.dst_schema: + # table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) + + # sql = 'ALTER TABLE `{0}`.`{1}` DELETE WHERE {2} = {3} '.format( + # schema, + # table, + # ' AND '.join(map(lambda column: '`%s`' % column, event.fieldnames)), + # ) + + # choptions = "" + # if self.host: + # choptions += " --host=" + shlex.quote(self.host) + # if self.port: + # choptions += " --port=" + str(self.port) + # if self.user: + # choptions += " --user=" + shlex.quote(self.user) + # if self.password: + # choptions += " --password=" + shlex.quote(self.password) + # bash = "tail -n +2 '{0}' | clickhouse-client {1} --query='{2}'".format( + # event.filename, + # choptions, + # sql, + # ) + + # logging.info('starting clickhouse-client process for delete operation') + # logging.debug('starting %s', bash) + # os.system(bash) + + logging.debug("CHCSVWriter: delete row") + pass + + def update(self, event_or_events=None): + # event_or_events = [ + # event: { + # row: {'id': 3, 'a': 3} + # }, + # event: { + # row: {'id': 3, 'a': 3} + # }, + # ] + + # logging.info('starting clickhouse-client process for update operation') + + # events = self.listify(event_or_events) + # if len(events) < 1: + # logging.warning('No events to update. class: %s', __class__) + # return + + # # assume we have at least one Event + + # logging.debug('class:%s update %d rows', __class__, len(events)) + + # for event in events: + # schema = self.dst_schema if self.dst_schema else event.schema + # table = None + # if self.dst_distribute: + # table = TableProcessor.create_distributed_table_name(db=event.schema, table=event.table) + # else: + # table = self.dst_table if self.dst_table else event.table + # if self.dst_schema: + # table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) + + # sql = 'INSERT INTO `{0}`.`{1}` ({2}) FORMAT CSV'.format( + # schema, + # table, + # ', '.join(map(lambda column: '`%s`' % column, event.fieldnames)), + # ) + + # sql = 'ALTER TABLE `{0}`.`{1}` UPDATE {3}'.format( + # schema, + # table, + # ', '.join(map(lambda column, value: '`%s`=`%s' % column, event.fieldnames, event.fieldnames)) + # ) + + # choptions = "" + # if self.host: + # choptions += " --host=" + shlex.quote(self.host) + # if self.port: + # choptions += " --port=" + str(self.port) + # if self.user: + # choptions += " --user=" + shlex.quote(self.user) + # if self.password: + # choptions += " --password=" + shlex.quote(self.password) + # bash = "tail -n +2 '{0}' | clickhouse-client {1} --query='{2}'".format( + # event.filename, + # choptions, + # sql, + # ) + + # logging.info('starting clickhouse-client process') + # logging.debug('starting %s', bash) + # os.system(bash) + + logging.debug("CHCSVWriter: delete row") + + pass diff --git a/init/dump-tables.sh b/init/dump-tables.sh deleted file mode 100755 index 25eb02b..0000000 --- a/init/dump-tables.sh +++ /dev/null @@ -1,132 +0,0 @@ -#!/bin/bash - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit -1 -fi - -DUMP_PATH=$1 - -source tb_tables.config - -########### -### titles 
-########### - -echo "Dumping titles" -mysqldump --host=127.0.0.1 --port=3307 --user=tinybird --password=goo7eu9AeS3i --single-transaction --quick movida_preproduction titles > $DUMP_PATH/titles.sql - -echo "use $TB_DATABASE;" > $DUMP_PATH/titles-insert-tb.sql -cat $DUMP_PATH/titles.sql | grep "INSERT INTO" >> $DUMP_PATH/titles-insert-tb.sql -sed -i 's/INSERT INTO `titles` VALUES/INSERT INTO `t_8a192b9c7ece4572a5a2fc9858e26d5c` (`id`, `name`, `licensor_id`, `created_at`, `updated_at`, `company_id`, `series_id`, `external_id`, `poster_file_name`, `poster_content_type`, `poster_file_size`, `poster_updated_at`, `episode_number`, `dirty_episode_number`, `rights_count`, `blackouts_count`, `denied_rights_count`, `images_count`, `cover_image_id`, `title_type`, `metadata_updated_at`, `promoted_content_id`, `promoted_content_type`, `soft_destroyed`, `credits_count`, `translated_attributes`, `rules_count`, `discarded`, `episode_reference_id`, `brand_id`) VALUES/g' $DUMP_PATH/titles-insert-tb.sql - -echo "Truncate titles table" -echo "truncate $TB_DATABASE.$TITLES_TABLE" | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn - -echo "Loading titles into CH" -cat $DUMP_PATH/titles-insert-tb.sql | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn -echo "Titles loaded" - -read -p "Press enter to continue" - -########### -### assets -########### - -echo "Dumping assets" -mysqldump --host=127.0.0.1 --port=3307 --user=tinybird --password=goo7eu9AeS3i --single-transaction --quick movida_preproduction assets > $DUMP_PATH/assets.sql - -echo "use $TB_DATABASE;" > $DUMP_PATH/assets-insert-tb.sql -cat $DUMP_PATH/assets.sql | grep "INSERT INTO" >> $DUMP_PATH/assets-insert-tb.sql -sed -i 's/INSERT INTO `assets` VALUES/INSERT INTO `t_4c03fdeb4e3e4db784ead40b06ec8617` (`id`, `name`, `title_id`, `created_at`, `updated_at`, `description`, `runtime_in_milliseconds`, `metadata_updated_at`, `company_id`, `asset_type_enumeration_entry_id`, `external_id`) VALUES/g' $DUMP_PATH/assets-insert-tb.sql - -echo "Truncate assets table" -echo "truncate $TB_DATABASE.$ASSETS_TABLE" | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn - -echo "Loading assets into CH" -cat $DUMP_PATH/assets-insert-tb.sql | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn -echo "Assets loaded" - -read -p "Press enter to continue" - -####################### -### Collection-entries -####################### - -echo "Dumping collection-entries" -mysqldump --host=127.0.0.1 --port=3307 --user=tinybird --password=goo7eu9AeS3i --single-transaction --quick movida_preproduction collection_entries > $DUMP_PATH/collections.sql - -echo "use $TB_DATABASE;" > $DUMP_PATH/collections-insert-tb.sql -cat $DUMP_PATH/collections.sql | grep "INSERT INTO" >> $DUMP_PATH/collections-insert-tb.sql -sed -i 's/INSERT INTO `collection_entries` VALUES/INSERT INTO `t_3dd7b323438943c687bd4e13a0e181a1` (`collection_id`, `title_id`, `id`, `position`) VALUES/g' $DUMP_PATH/collections-insert-tb.sql - -echo "Truncate collections table" -echo "truncate $TB_DATABASE.$COLLECTIONS_TABLE" | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn - -echo "Loading collection-entries into CH" -cat $DUMP_PATH/collections-insert-tb.sql | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn -echo "Collection-entries loaded" - -read -p "Press enter to continue" - -############## -### Features -############## - -echo "Dumping features" 
-mysqldump --host=127.0.0.1 --port=3307 --user=tinybird --password=goo7eu9AeS3i --single-transaction --quick movida_preproduction features > $DUMP_PATH/features.sql - -echo "use $TB_DATABASE;" > $DUMP_PATH/features-insert-tb.sql -read -p "Press enter to continue use" -cat $DUMP_PATH/features.sql | grep "INSERT INTO" >> $DUMP_PATH/features-insert-tb.sql -read -p "Press enter to continue insert" -sed -i 's/INSERT INTO `features` VALUES/INSERT INTO `t_23f41723e0eb480088cbb1c8f890a38c` (`id`, `name`, `enabled`, `company_id`, `created_at`, `updated_at`) VALUES/g' $DUMP_PATH/features-insert-tb.sql -read -p "Press enter to continue sed" -echo "Truncate features table" -echo "truncate $TB_DATABASE.$FEATURES_TABLE" | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn - -echo "Loading features into CH" -cat $DUMP_PATH/features-insert-tb.sql | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn -echo "Features loaded" - -read -p "Press enter to continue" - -############## -### Platforms -############## - -echo "Dumping platforms" -mysqldump --host=127.0.0.1 --port=3307 --user=tinybird --password=goo7eu9AeS3i --single-transaction --quick movida_preproduction platforms > $DUMP_PATH/platforms.sql - -echo "use $TB_DATABASE;" > $DUMP_PATH/platforms-insert-tb.sql -cat $DUMP_PATH/platforms.sql | grep "INSERT INTO" >> $DUMP_PATH/platforms-insert-tb.sql -sed -i 's/INSERT INTO `platforms` VALUES/INSERT INTO `t_83f598dc74254de68216a7c7735caffb` (`id`, `company_id`, `name`, `created_at`, `updated_at`, `sequence_service_titles_url`, `_deprecated_sequence_template_name`, `_deprecated_owned`, `sequence_template_url`, `metadata_constant_name`, `outlet_id`, `automatic_publication_enabled`, `metadata_updated_at`, `granted_categories`, `external_id`, `timezone`) VALUES/g' $DUMP_PATH/platforms-insert-tb.sql - -echo "Truncate platforms table" -echo "truncate $TB_DATABASE.$PLATFORMS_TABLE" | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn - -echo "Loading platforms into CH" -cat $DUMP_PATH/platforms-insert-tb.sql | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn -echo "Platforms loaded" - -read -p "Press enter to continue" - -################# -### Schedulings -################# - -echo "Dumping schedulings" -mysqldump --host=127.0.0.1 --port=3307 --user=tinybird --password=goo7eu9AeS3i --single-transaction --quick movida_preproduction schedulings > $DUMP_PATH/schedulings.sql - -echo "use $TB_DATABASE;" > $DUMP_PATH/schedulings-insert-tb.sql -cat $DUMP_PATH/schedulings.sql | grep "INSERT INTO" >> $DUMP_PATH/schedulings-insert-tb.sql -sed -i 's/INSERT INTO `schedulings` VALUES/INSERT INTO `t_b5e541d4e73d4301ba736c427bd667c5` (`id`, `title_id`, `put_up`, `take_down`, `created_at`, `updated_at`, `cleared`, `platform_id`, `rule_id`, `workflow_offset`, `sequence_asset_url`, `sequence_asset_name`, `workflow_sent`, `status`, `asset_id`, `rule_asset_id`, `title_group_id`, `workflow_web_url`, `_deprecated_publication_status`, `published_at`, `_prev_put_up`, `_prev_take_down`, `_pending_simulation`, `workflow_template_url`, `original_draft_scheduling_id`, `playlist_id`, `updating_playlist`, `workflow_job_url`, `workflow_status`, `conflict_types`, `metadata_updated_at`, `company_id`, `cached_title_episode_number`, `metadata_status`, `publication_status`, `publication_status_updated_at`, `metadata_status_updated_at`, `external_id`, `disabled_at`, `scheduling_type`, `overridden_rule_attributes`, `update_in_progress`, 
`metadata_error_digest`) VALUES/g' $DUMP_PATH/schedulings-insert-tb.sql - -echo "Truncate schedulings table" -echo "truncate $TB_DATABASE.$SCHEDULINGS_TABLE" | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn - -echo "Loading schedulings into CH" -cat $DUMP_PATH/schedulings-insert-tb.sql | ~/tinybird/bin/ch/ch-20.7.2.30/ClickHouse/build/programs/clickhouse-client -mn -echo "Schedulings loaded" - -echo "Process finished!" \ No newline at end of file diff --git a/init/first-processing.sh b/init/first-processing.sh deleted file mode 100755 index 7daa44c..0000000 --- a/init/first-processing.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit -1 -fi - -echo "Generate binlog timelog" -./run-listener.sh -./stop-listeners.sh - -echo "Generating dumps and loading data ..." -./dump-tables.sh $1 - -echo "Starting listeners" -./run-listener.sh - -echo "Done!" \ No newline at end of file diff --git a/init/run-listeners.sh b/init/run-listeners.sh deleted file mode 100755 index 21a7502..0000000 --- a/init/run-listeners.sh +++ /dev/null @@ -1,130 +0,0 @@ -#!/bin/bash - -LOG_LEVEL=debug - -SOURCE_HOST=127.0.0.1 -SOURCE_PORT=3307 -DESTINATION_HOST=127.0.0.1 -SOURCE_USER=tinybird -SOURCE_PASSWD=goo7eu9AeS3i - -PID_LOG_FILE=/tmp/listeners-pid.log - -source tb_tables.config - -############################################################ -# Run a process to synchronize MySQL table using binlog. -# -# $1 --> Source schema -# $2 --> Source table -# $3 --> Destination schema -# $4 --> Destination table -# $5 --> Server id -# $6 --> Log file -# $7 --> Binlog position file -# -############################################################# -function run_listener() { - - (clickhouse-mysql --src-server-id=$5 --src-wait --src-resume --binlog-position-file $7 --nice-pause=1 --src-host=$SOURCE_HOST --src-port=$SOURCE_PORT --src-user=$SOURCE_USER --src-password=$SOURCE_PASSWD --src-schemas=$1 --src-tables=$2 --dst-host=$DESTINATION_HOST --dst-schema=$3 --dst-table=$4 --log-level=$LOG_LEVEL --pump-data 2>> $6)& - -} - -function run_schedulings() { - if [ $binlog == "true" ]; then - rm "bl-pos-collections" - fi - - run_listener "movida_preproduction" "schedulings" "$TB_DATABASE" "$SCHEDULINGS_TABLE" "91" "out-schedulings.log" "bl-pos-schedulings" - echo $! > $PID_LOG_FILE - -} - -function run_platforms() { - if [ $binlog == "true" ]; then - rm "bl-pos-collections" - fi - - run_listener "movida_preproduction" "platforms" "$TB_DATABASE" "$PLATFORMS_TABLE" "92" "out-platforms.log" "bl-pos-platforms" - echo $! >> $PID_LOG_FILE - -} - -function run_titles() { - if [ $binlog == "true" ]; then - rm "bl-pos-collections" - fi - - run_listener "movida_preproduction" "titles" "$TB_DATABASE" "$TITLES_TABLE" "93" "out-titles.log" "bl-pos-titles" - echo $! >> $PID_LOG_FILE -} - -function run_assets() { - if [ $binlog == "true" ]; then - rm "bl-pos-collections" - fi - - run_listener "movida_preproduction" "assets" "$TB_DATABASE" "$ASSETS_TABLE" "94" "out-assets.log" "bl-pos-assets" - echo $! >> $PID_LOG_FILE -} - -function run_features() { - if [ $binlog == "true" ]; then - rm "bl-pos-collections" - fi - - run_listener "movida_preproduction" "features" "$TB_DATABASE" "$FEATURES_TABLE" "95" "out-features.log" "bl-pos-features" - echo $! 
>> $PID_LOG_FILE -} - -function run_collections() { - if [ $binlog == "true" ]; then - rm "bl-pos-collections" - fi - - run_listener "movida_preproduction" "collection_entries" "$TB_DATABASE" "$COLLECTIONS_TABLE" "96" "out-collections.log" "bl-pos-collections" - echo $! >> $PID_LOG_FILE -} - -function usage { - echo "usage: $0 -d datasource [-b clean_binlog]" - echo " -d datasource datasource to syn. Use all for synchronizing all available datasources." - echo " - all" - echo " - schedulings" - echo " - platforms" - echo " - titles" - echo " - assets" - echo " - features" - echo " - collections" - echo " -b clean_binlog clean binlog before running (true | false) False by default" - exit -1 -} - -datasource="NONE" -while getopts d:b: flag -do - case "${flag}" in - d) datasource=${OPTARG};; - b) binlog=${OPTARG};; - esac -done - -case "${datasource}" in - NONE) usage;; - all) run_schedulings binlog - run_platforms binlog - run_titles binlog - run_assets binlog - run_features binlog - run_collections binlog - ;; - schedulings) run_schedulings binlog;; - platforms) run_platforms binlog;; - titles) run_titles binlog;; - assets) run_assets binlog;; - features) run_features binlog;; - collections) run_collections binlog;; - *) usage;; -esac - -echo "PID processes in $PID_LOG_FILE" \ No newline at end of file diff --git a/init/stop-listeners.sh b/init/stop-listeners.sh deleted file mode 100755 index 582e97c..0000000 --- a/init/stop-listeners.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -PID_LOG_FILE=/tmp/listeners-pid.log - -count_processes() { - echo `ps aux | grep clickhouse-mysql-data-reader | wc -l` -} - -total_before=$(count_processes) - -while IFS= read -r line -do - echo "$line" - kill $line -done < "$PID_LOG_FILE" - -total_after=$(count_processes) - -procs=`echo "$total_after - 1" | bc` - -if [ $total_after -gt 1 ]; then - echo "You still have $procs processes running" -fi \ No newline at end of file diff --git a/init/tb_tables.config b/init/tb_tables.config deleted file mode 100644 index be59079..0000000 --- a/init/tb_tables.config +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -TB_DATABASE='d_073c5e' -TITLES_TABLE='t_8a192b9c7ece4572a5a2fc9858e26d5c' -ASSETS_TABLE='t_4c03fdeb4e3e4db784ead40b06ec8617' -COLLECTIONS_TABLE='t_3dd7b323438943c687bd4e13a0e181a1' -FEATURES_TABLE='t_23f41723e0eb480088cbb1c8f890a38c' -PLATFORMS_TABLE='t_83f598dc74254de68216a7c7735caffb' -SCHEDULINGS_TABLE='t_b5e541d4e73d4301ba736c427bd667c5' \ No newline at end of file diff --git a/notes.txt b/notes.txt new file mode 100644 index 0000000..20da4c9 --- /dev/null +++ b/notes.txt @@ -0,0 +1,3 @@ +# Add delete field + +awk -F"," 'BEGIN { OFS = "," } {$45="0"; print}' test.csv > test-out.csv \ No newline at end of file From 4218b4a8dd203d8bd21e022d8018128c82f81618 Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Tue, 10 Aug 2021 19:22:15 +0200 Subject: [PATCH 48/67] Fix missing config --- clickhouse_mysql/config.py | 3 ++- clickhouse_mysql/writer/tbcsvwriter.py | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clickhouse_mysql/config.py b/clickhouse_mysql/config.py index 1fc9030..3103c87 100644 --- a/clickhouse_mysql/config.py +++ b/clickhouse_mysql/config.py @@ -366,7 +366,8 @@ def writer_builder_csvpool(self): class_name=TBCSVWriter, constructor_params={ 'tb_host': self.config['tinybird']['host'], - 'tb_token': self.config['tinybird']['token'] + 'tb_token': self.config['tinybird']['token'], + 'dst_table': self.config['writer']['clickhouse']['dst_table'] } ), 'converter_builder': 
self.converter_builder(CONVERTER_CSV), diff --git a/clickhouse_mysql/writer/tbcsvwriter.py b/clickhouse_mysql/writer/tbcsvwriter.py index 60482f0..f2a6d50 100644 --- a/clickhouse_mysql/writer/tbcsvwriter.py +++ b/clickhouse_mysql/writer/tbcsvwriter.py @@ -71,10 +71,9 @@ def insert(self, event_or_events=None): for event in events: #schema = self.dst_schema if self.dst_schema else event.schema - #table = self.dst_table if self.dst_table else event.table - + table = self.dst_table if self.dst_table else event.table params = { - 'name': self.dst_table, + 'name': table, 'mode': 'append' } From 605089208820da10fde07ddf67525377e8751daa Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Tue, 7 Sep 2021 19:52:00 +0300 Subject: [PATCH 49/67] Added shutdown gracefully --- clickhouse_mysql/pumper.py | 16 ++++++++++-- clickhouse_mysql/reader/mysqlreader.py | 21 ++++++++++------ clickhouse_mysql/reader/reader.py | 3 +++ clickhouse_mysql/writer/poolwriter.py | 8 ++++-- clickhouse_mysql/writer/tbcsvwriter.py | 35 +++++++++++++------------- setup.py | 2 ++ 6 files changed, 56 insertions(+), 29 deletions(-) diff --git a/clickhouse_mysql/pumper.py b/clickhouse_mysql/pumper.py index 0b5d0c3..b1ec9df 100644 --- a/clickhouse_mysql/pumper.py +++ b/clickhouse_mysql/pumper.py @@ -2,18 +2,26 @@ # -*- coding: utf-8 -*- +from clickhouse_mysql.reader.reader import Reader +from clickhouse_mysql.writer.writer import Writer +import signal + + class Pumper(object): """ Pump data - read data from reader and push into writer """ - reader = None - writer = None + reader: Reader = None + writer: Writer = None def __init__(self, reader=None, writer=None): self.reader = reader self.writer = writer + signal.signal(signal.SIGINT, self.exit_gracefully) + signal.signal(signal.SIGTERM, self.exit_gracefully) + if self.reader: # subscribe on reader's event notifications self.reader.subscribe({ @@ -60,6 +68,10 @@ def update_rows_event(self, event=None): :param event: """ self.writer.update(event) + + def exit_gracefully(self): + self.reader.close() + self.writer.close() if __name__ == '__main__': diff --git a/clickhouse_mysql/reader/mysqlreader.py b/clickhouse_mysql/reader/mysqlreader.py index 040bac0..79fb0d2 100644 --- a/clickhouse_mysql/reader/mysqlreader.py +++ b/clickhouse_mysql/reader/mysqlreader.py @@ -29,6 +29,7 @@ class MySQLReader(Reader): resume_stream = None binlog_stream = None nice_pause = 0 + exit_gracefully = False write_rows_event_num = 0 write_rows_event_each_row_num = 0; @@ -389,7 +390,7 @@ def read(self): # fetch events try: - while True: + while not self.exit_gracefully: logging.debug('Check events in binlog stream') self.init_fetch_loop() @@ -423,10 +424,6 @@ def read(self): # after event processed, we need to handle current binlog position self.process_binlog_position(self.binlog_stream.log_file, self.binlog_stream.log_pos) - except KeyboardInterrupt: - # pass SIGINT further - logging.info("SIGINT received. Pass it further.") - raise except Exception as ex: if self.blocking: # we'd like to continue waiting for data @@ -454,8 +451,6 @@ def read(self): self.notify('ReaderIdleEvent') - except KeyboardInterrupt: - logging.info("SIGINT received. 
Time to exit.") except Exception as ex: logging.warning("Got an exception, handle it") logging.warning(ex) @@ -473,6 +468,18 @@ def read(self): logging.info('len %d', end_timestamp - self.start_timestamp) + def close(self): + self.exit_gracefully = True + try: + self.binlog_stream.close() + except Exception as ex: + logging.warning("Unable to close binlog stream correctly") + logging.warning(ex) + + logging.info("MySQL reader closed") + + + if __name__ == '__main__': connection_settings = { 'host': '127.0.0.1', diff --git a/clickhouse_mysql/reader/reader.py b/clickhouse_mysql/reader/reader.py index c4f5246..107d04a 100644 --- a/clickhouse_mysql/reader/reader.py +++ b/clickhouse_mysql/reader/reader.py @@ -33,3 +33,6 @@ def __init__(self, converter=None, callbacks={}): def read(self): pass + + def close(self): + pass diff --git a/clickhouse_mysql/writer/poolwriter.py b/clickhouse_mysql/writer/poolwriter.py index 071a6ef..5c06fc4 100644 --- a/clickhouse_mysql/writer/poolwriter.py +++ b/clickhouse_mysql/writer/poolwriter.py @@ -37,7 +37,6 @@ def insert(self, event_or_events): logging.debug('class:%s insert', __class__) self.pool.insert(event_or_events) - # TODO delete if delete_row works def delete(self, event_or_events): """Insert delete data into Pool""" @@ -54,10 +53,15 @@ def update(self, event_or_events): logging.debug('class:%s update', __class__) self.pool.insert(event_or_events) - def flush(self): self.pool.flush() + + def close(self): + self.pool.flush() + logging.info("Closed PoolWriter") + + if __name__ == '__main__': path = 'file.csv' diff --git a/clickhouse_mysql/writer/tbcsvwriter.py b/clickhouse_mysql/writer/tbcsvwriter.py index f2a6d50..dca3819 100644 --- a/clickhouse_mysql/writer/tbcsvwriter.py +++ b/clickhouse_mysql/writer/tbcsvwriter.py @@ -77,24 +77,23 @@ def insert(self, event_or_events=None): 'mode': 'append' } - f = open(event.filename, 'rb') - m = MultipartEncoder(fields={'csv': ('csv', f, 'text/csv')}) - - url = f"{self.tb_host}/v0/datasources" - - response = requests.post(url, data=m, - headers={'Authorization': 'Bearer ' + self.tb_token, 'Content-Type': m.content_type}, - params=params - ) - - # logging.debug(response.text) - if response.status_code == 200: - json_object = json.loads(response.content) - logging.debug(f"Import id: {json_object['import_id']}") - # logging.debug(f"Response: {json.dumps(json_object, indent=2)}") - - else: - logging.debug(f"ERROR {response.text}") + with open(event.filename, 'rb') as f: + m = MultipartEncoder(fields={'csv': ('csv', f, 'text/csv')}) + url = f"{self.tb_host}/v0/datasources" + + response = requests.post(url, data=m, + headers={'Authorization': 'Bearer ' + self.tb_token, 'Content-Type': m.content_type}, + params=params + ) + + # logging.debug(response.text) + if response.status_code == 200: + json_object = json.loads(response.content) + logging.debug(f"Import id: {json_object['import_id']}") + # logging.debug(f"Response: {json.dumps(json_object, indent=2)}") + + else: + logging.debug(f"ERROR {response.text}") pass diff --git a/setup.py b/setup.py index f5be528..a8b39e2 100644 --- a/setup.py +++ b/setup.py @@ -79,6 +79,8 @@ 'clickhouse-driver', 'configobj', 'setuptools', + 'requests_toolbelt', + 'requests' ], # cross-platform support for pip to create the appropriate form of executable From 8070102fcf2331c034442d6873e7f9c29e858d53 Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Tue, 7 Sep 2021 20:24:29 +0300 Subject: [PATCH 50/67] Moved signal handler to main --- clickhouse_mysql/main.py | 5 +++++ 
clickhouse_mysql/pumper.py | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/clickhouse_mysql/main.py b/clickhouse_mysql/main.py index 662fd4d..d751573 100644 --- a/clickhouse_mysql/main.py +++ b/clickhouse_mysql/main.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import signal import sys import multiprocessing as mp import logging @@ -145,6 +146,10 @@ def run(self): reader=self.config.reader(), writer=self.config.writer(), ) + + signal.signal(signal.SIGINT, pumper.exit_gracefully) + signal.signal(signal.SIGTERM, pumper.exit_gracefully) + pumper.run() except Exception as ex: diff --git a/clickhouse_mysql/pumper.py b/clickhouse_mysql/pumper.py index b1ec9df..6aadef9 100644 --- a/clickhouse_mysql/pumper.py +++ b/clickhouse_mysql/pumper.py @@ -19,9 +19,6 @@ def __init__(self, reader=None, writer=None): self.reader = reader self.writer = writer - signal.signal(signal.SIGINT, self.exit_gracefully) - signal.signal(signal.SIGTERM, self.exit_gracefully) - if self.reader: # subscribe on reader's event notifications self.reader.subscribe({ From 5ac5a1b5e9b3b0dbb4392643e8fbe194dd72d6d7 Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Wed, 8 Sep 2021 10:52:46 +0300 Subject: [PATCH 51/67] Fix issue with MySQL graceful exit --- clickhouse_mysql/pumper.py | 2 +- clickhouse_mysql/reader/mysqlreader.py | 18 ++++++------------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/clickhouse_mysql/pumper.py b/clickhouse_mysql/pumper.py index 6aadef9..7cad8e3 100644 --- a/clickhouse_mysql/pumper.py +++ b/clickhouse_mysql/pumper.py @@ -66,7 +66,7 @@ def update_rows_event(self, event=None): """ self.writer.update(event) - def exit_gracefully(self): + def exit_gracefully(self, sig, frame): self.reader.close() self.writer.close() diff --git a/clickhouse_mysql/reader/mysqlreader.py b/clickhouse_mysql/reader/mysqlreader.py index 79fb0d2..71f0e9a 100644 --- a/clickhouse_mysql/reader/mysqlreader.py +++ b/clickhouse_mysql/reader/mysqlreader.py @@ -429,11 +429,11 @@ def read(self): # we'd like to continue waiting for data # report and continue cycle logging.warning("Got an exception, skip it in blocking mode") - logging.warning(ex) + logging.exception(ex) else: # do not continue, report error and exit logging.critical("Got an exception, abort it in non-blocking mode") - logging.critical(ex) + logging.exception(ex) sys.exit(1) # all events fetched (or none of them available) @@ -450,16 +450,16 @@ def read(self): time.sleep(self.nice_pause) self.notify('ReaderIdleEvent') - except Exception as ex: logging.warning("Got an exception, handle it") - logging.warning(ex) + logging.exception(ex) try: self.binlog_stream.close() + logging.info("Stop reading from MySQL") except Exception as ex: logging.warning("Unable to close binlog stream correctly") - logging.warning(ex) + logging.exception(ex) end_timestamp = int(time.time()) @@ -470,13 +470,7 @@ def read(self): def close(self): self.exit_gracefully = True - try: - self.binlog_stream.close() - except Exception as ex: - logging.warning("Unable to close binlog stream correctly") - logging.warning(ex) - - logging.info("MySQL reader closed") + logging.info("MySQL should stop in the next loop") From 1912301eda7a013b642c57d0da6851d8d48a3897 Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Wed, 8 Sep 2021 12:29:04 +0300 Subject: [PATCH 52/67] Add retry in case of 429 --- clickhouse_mysql/writer/tbcsvwriter.py | 55 +++++++++++++++----------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git 
a/clickhouse_mysql/writer/tbcsvwriter.py b/clickhouse_mysql/writer/tbcsvwriter.py index dca3819..b490bc5 100644 --- a/clickhouse_mysql/writer/tbcsvwriter.py +++ b/clickhouse_mysql/writer/tbcsvwriter.py @@ -4,6 +4,7 @@ import os import logging import shlex +import time from clickhouse_mysql.writer.writer import Writer from clickhouse_mysql.tableprocessor import TableProcessor @@ -50,6 +51,37 @@ def __init__( self.dst_table_prefix = dst_table_prefix self.dst_distribute = dst_distribute + + def uploadCSV(self, table, filename): + params = { + 'name': table, + 'mode': 'append' + } + + with open(filename, 'rb') as f: + m = MultipartEncoder(fields={'csv': ('csv', f, 'text/csv')}) + url = f"{self.tb_host}/v0/datasources" + + response = requests.post(url, data=m, + headers={'Authorization': 'Bearer ' + self.tb_token, 'Content-Type': m.content_type}, + params=params + ) + + # logging.debug(response.text) + if response.status_code == 200: + json_object = json.loads(response.content) + logging.debug(f"Import id: {json_object['import_id']}") + elif response.status_code == 429: + logging.error(f"Too many requests retrying in {response.headers['Retry-After']} seconds", response) + time.sleep(response.headers['Retry-After']) + self.uploadCSV(table, filename) + + else: + logging.error(response.text) + + + + def insert(self, event_or_events=None): # event_or_events = [ # event: { @@ -72,28 +104,7 @@ def insert(self, event_or_events=None): for event in events: #schema = self.dst_schema if self.dst_schema else event.schema table = self.dst_table if self.dst_table else event.table - params = { - 'name': table, - 'mode': 'append' - } - - with open(event.filename, 'rb') as f: - m = MultipartEncoder(fields={'csv': ('csv', f, 'text/csv')}) - url = f"{self.tb_host}/v0/datasources" - - response = requests.post(url, data=m, - headers={'Authorization': 'Bearer ' + self.tb_token, 'Content-Type': m.content_type}, - params=params - ) - - # logging.debug(response.text) - if response.status_code == 200: - json_object = json.loads(response.content) - logging.debug(f"Import id: {json_object['import_id']}") - # logging.debug(f"Response: {json.dumps(json_object, indent=2)}") - - else: - logging.debug(f"ERROR {response.text}") + self.uploadCSV(table, event.filename) pass From a2f3884aa86a2a9765ddbd085df68f91c94b0fda Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Wed, 8 Sep 2021 12:51:02 +0300 Subject: [PATCH 53/67] Graceful exit only stops reader --- clickhouse_mysql/pumper.py | 1 - clickhouse_mysql/reader/mysqlreader.py | 1 + clickhouse_mysql/writer/poolwriter.py | 5 ----- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/clickhouse_mysql/pumper.py b/clickhouse_mysql/pumper.py index 7cad8e3..ffffc83 100644 --- a/clickhouse_mysql/pumper.py +++ b/clickhouse_mysql/pumper.py @@ -68,7 +68,6 @@ def update_rows_event(self, event=None): def exit_gracefully(self, sig, frame): self.reader.close() - self.writer.close() if __name__ == '__main__': diff --git a/clickhouse_mysql/reader/mysqlreader.py b/clickhouse_mysql/reader/mysqlreader.py index 71f0e9a..3149489 100644 --- a/clickhouse_mysql/reader/mysqlreader.py +++ b/clickhouse_mysql/reader/mysqlreader.py @@ -470,6 +470,7 @@ def read(self): def close(self): self.exit_gracefully = True + self.nice_pause = 0 logging.info("MySQL should stop in the next loop") diff --git a/clickhouse_mysql/writer/poolwriter.py b/clickhouse_mysql/writer/poolwriter.py index 5c06fc4..303ed84 100644 --- a/clickhouse_mysql/writer/poolwriter.py +++ b/clickhouse_mysql/writer/poolwriter.py @@ 
-57,11 +57,6 @@ def flush(self): self.pool.flush() - def close(self): - self.pool.flush() - logging.info("Closed PoolWriter") - - if __name__ == '__main__': path = 'file.csv' From 19e3d2c4d36f7816f3d8e68795b79f4af2b8cb6d Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Wed, 8 Sep 2021 13:29:56 +0300 Subject: [PATCH 54/67] cast retry-after to int --- clickhouse_mysql/writer/tbcsvwriter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clickhouse_mysql/writer/tbcsvwriter.py b/clickhouse_mysql/writer/tbcsvwriter.py index b490bc5..f3cc194 100644 --- a/clickhouse_mysql/writer/tbcsvwriter.py +++ b/clickhouse_mysql/writer/tbcsvwriter.py @@ -73,7 +73,7 @@ def uploadCSV(self, table, filename): logging.debug(f"Import id: {json_object['import_id']}") elif response.status_code == 429: logging.error(f"Too many requests retrying in {response.headers['Retry-After']} seconds", response) - time.sleep(response.headers['Retry-After']) + time.sleep(int(response.headers['Retry-After'])) self.uploadCSV(table, filename) else: From da57a75cbd2a23d0cded73b3d516cbf3c579a0e5 Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Wed, 8 Sep 2021 14:33:15 +0300 Subject: [PATCH 55/67] Improve retry in TB CSV --- clickhouse_mysql/writer/tbcsvwriter.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/clickhouse_mysql/writer/tbcsvwriter.py b/clickhouse_mysql/writer/tbcsvwriter.py index f3cc194..1803ff9 100644 --- a/clickhouse_mysql/writer/tbcsvwriter.py +++ b/clickhouse_mysql/writer/tbcsvwriter.py @@ -52,7 +52,8 @@ def __init__( self.dst_distribute = dst_distribute - def uploadCSV(self, table, filename): + def uploadCSV(self, table, filename, tries=1): + limit_of_retries=3 params = { 'name': table, 'mode': 'append' @@ -68,19 +69,23 @@ def uploadCSV(self, table, filename): ) # logging.debug(response.text) + logging.info(response.json()) if response.status_code == 200: json_object = json.loads(response.content) logging.debug(f"Import id: {json_object['import_id']}") elif response.status_code == 429: - logging.error(f"Too many requests retrying in {response.headers['Retry-After']} seconds", response) - time.sleep(int(response.headers['Retry-After'])) - self.uploadCSV(table, filename) - - else: - logging.error(response.text) - - - + retry_after = int(response.headers['Retry-After']) + tries + logging.error(f"Too many requests retrying in {retry_after} seconds to upload {filename } to {table}") + time.sleep(retry_after) + self.uploadCSV(table, filename, tries+1) + else: + # In case of error let's retry only + logging.exception(response.json()) + time.sleep(tries) + logging.info(f"Retrying { tries } of { limit_of_retries }") + if tries > limit_of_retries: + return + self.uploadCSV(self, table, filename, tries + 1) def insert(self, event_or_events=None): # event_or_events = [ From abc31b14a76a5cf12e04686222642652078af16b Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Wed, 8 Sep 2021 16:58:08 +0300 Subject: [PATCH 56/67] If received SIGINT break the loop --- clickhouse_mysql/reader/mysqlreader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/clickhouse_mysql/reader/mysqlreader.py b/clickhouse_mysql/reader/mysqlreader.py index 3149489..4286792 100644 --- a/clickhouse_mysql/reader/mysqlreader.py +++ b/clickhouse_mysql/reader/mysqlreader.py @@ -404,8 +404,9 @@ def read(self): # fetch available events from MySQL for mysql_event in self.binlog_stream: - # new event has come - # check what to do with it + + if self.exit_gracefully: + 
break logging.debug( 'Got Event ' + self.binlog_stream.log_file + ":" + str(self.binlog_stream.log_pos)) From f88c5a060cadbef96f7f3943b011270e439a7a3c Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Wed, 8 Sep 2021 17:09:31 +0300 Subject: [PATCH 57/67] Improve logging when reading binlog pos --- clickhouse_mysql/config.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/clickhouse_mysql/config.py b/clickhouse_mysql/config.py index 3103c87..929cf5b 100644 --- a/clickhouse_mysql/config.py +++ b/clickhouse_mysql/config.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import logging from clickhouse_mysql.reader.mysqlreader import MySQLReader from clickhouse_mysql.reader.csvreader import CSVReader @@ -50,10 +51,11 @@ def __init__(self): log_file, log_pos )) - except: + except Exception as e: + logging.exception(e) log_file = None log_pos = None - print("can't read binlog position from file {}".format( + logging.info("can't read binlog position from file {}".format( self.options['binlog_position_file'], )) # build application config out of aggregated options From 3c18a0d614d6876280d87be1988e90c752e599a9 Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Wed, 8 Sep 2021 17:51:42 +0300 Subject: [PATCH 58/67] Validate that binlog file exist before reading it --- clickhouse_mysql/config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/clickhouse_mysql/config.py b/clickhouse_mysql/config.py index 929cf5b..217aa0b 100644 --- a/clickhouse_mysql/config.py +++ b/clickhouse_mysql/config.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import logging +import os from clickhouse_mysql.reader.mysqlreader import MySQLReader from clickhouse_mysql.reader.csvreader import CSVReader @@ -40,7 +41,7 @@ def __init__(self): log_file = None log_pos = None - if self.options['binlog_position_file'] and self.options.get_bool('src_resume'): + if self.options['binlog_position_file'] and self.options.get_bool('src_resume') and os.path.exists(self.options['binlog_position_file']): try: with open(self.options['binlog_position_file'], 'r') as f: position = f.read() @@ -52,9 +53,9 @@ def __init__(self): log_pos )) except Exception as e: - logging.exception(e) log_file = None log_pos = None + logging.exception(e) logging.info("can't read binlog position from file {}".format( self.options['binlog_position_file'], )) From 5ce22aea29b137396262a34ef4296eaaed843a5c Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Mon, 13 Sep 2021 13:18:35 +0300 Subject: [PATCH 59/67] remove local and test script From c92f49c3cbcc48e8188d24f2e12a46470cd03389 Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Tue, 14 Sep 2021 14:50:54 +0300 Subject: [PATCH 60/67] Added try/catch in upload to retry in case of error --- clickhouse_mysql/writer/tbcsvwriter.py | 80 +++++++++++++++----------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/clickhouse_mysql/writer/tbcsvwriter.py b/clickhouse_mysql/writer/tbcsvwriter.py index 1803ff9..f7a28f6 100644 --- a/clickhouse_mysql/writer/tbcsvwriter.py +++ b/clickhouse_mysql/writer/tbcsvwriter.py @@ -1,18 +1,16 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import os import logging -import shlex import time from clickhouse_mysql.writer.writer import Writer -from clickhouse_mysql.tableprocessor import TableProcessor import requests from requests_toolbelt.multipart.encoder import MultipartEncoder import json + class TBCSVWriter(Writer): """Write into Tinybird via CSV file""" @@ -43,7 +41,8 @@ def __init__( self.tb_token = 
tb_token if self.tb_host is None or self.tb_token is None: - logging.critical(f" Host: {self.tb_host} or token {self.tb_token} is missing") + logging.critical( + f" Host: {self.tb_host} or token {self.tb_token} is missing") return None self.dst_schema = dst_schema @@ -51,41 +50,54 @@ def __init__( self.dst_table_prefix = dst_table_prefix self.dst_distribute = dst_distribute - def uploadCSV(self, table, filename, tries=1): - limit_of_retries=3 + limit_of_retries = 3 params = { 'name': table, 'mode': 'append' } - with open(filename, 'rb') as f: - m = MultipartEncoder(fields={'csv': ('csv', f, 'text/csv')}) - url = f"{self.tb_host}/v0/datasources" - - response = requests.post(url, data=m, - headers={'Authorization': 'Bearer ' + self.tb_token, 'Content-Type': m.content_type}, - params=params - ) - - # logging.debug(response.text) - logging.info(response.json()) - if response.status_code == 200: - json_object = json.loads(response.content) - logging.debug(f"Import id: {json_object['import_id']}") - elif response.status_code == 429: - retry_after = int(response.headers['Retry-After']) + tries - logging.error(f"Too many requests retrying in {retry_after} seconds to upload {filename } to {table}") - time.sleep(retry_after) - self.uploadCSV(table, filename, tries+1) - else: - # In case of error let's retry only - logging.exception(response.json()) - time.sleep(tries) - logging.info(f"Retrying { tries } of { limit_of_retries }") - if tries > limit_of_retries: - return - self.uploadCSV(self, table, filename, tries + 1) + try: + with open(filename, 'rb') as f: + m = MultipartEncoder(fields={'csv': ('csv', f, 'text/csv')}) + url = f"{self.tb_host}/v0/datasources" + + response = requests.post( + url, + data=m, + headers={ + 'Authorization': 'Bearer ' + self.tb_token, + 'Content-Type': m.content_type + }, + params=params) + + # logging.debug(response.text) + logging.info(response.json()) + if response.status_code == 200: + json_object = json.loads(response.content) + logging.debug(f"Import id: {json_object['import_id']}") + elif response.status_code == 429: + retry_after = int(response.headers['Retry-After']) + tries + logging.error( + f"Too many requests retrying in {retry_after} seconds to upload {filename } to {table}") + time.sleep(retry_after) + self.uploadCSV(table, filename, tries+1) + else: + # In case of error let's retry only + logging.exception(response.json()) + time.sleep(tries) + logging.info(f"Retrying { tries } of { limit_of_retries }") + if tries > limit_of_retries: + return + self.uploadCSV(table, filename, tries + 1) + except Exception as e: + logging.exception(e) + # We wait tries^2 sec to try again + time.sleep(tries * tries) + logging.info(f"Retrying { tries } of { limit_of_retries }") + if tries > limit_of_retries: + return + self.uploadCSV(table, filename, tries + 1) def insert(self, event_or_events=None): # event_or_events = [ @@ -166,7 +178,7 @@ def deleteRow(self, event_or_events=None): # logging.info('starting clickhouse-client process for delete operation') # logging.debug('starting %s', bash) # os.system(bash) - + logging.debug("CHCSVWriter: delete row") pass From 80d8816d070dcdae04718f587ae7c59ca7c11e91 Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Thu, 23 Sep 2021 10:22:14 +0300 Subject: [PATCH 61/67] Changed CSV writer to use QUOTE_ALL --- clickhouse_mysql/writer/csvwriter.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/clickhouse_mysql/writer/csvwriter.py b/clickhouse_mysql/writer/csvwriter.py index 00c6eaf..c0e14a1 100644 --- 
a/clickhouse_mysql/writer/csvwriter.py +++ b/clickhouse_mysql/writer/csvwriter.py @@ -4,7 +4,6 @@ import csv import os.path import logging -import copy import time import uuid @@ -136,9 +135,9 @@ def insert(self, event_or_events): if self.dst_table is None: self.dst_table = event.table - self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames) + self.writer = csv.writer(self.file, quoting=csv.QUOTE_ALL) if not self.header_written: - self.writer.writeheader() + self.writer.writerow(self.fieldnames) for event in events: if not event.verify: @@ -190,9 +189,9 @@ def delete_row(self, event_or_events): if self.dst_table is None: self.dst_table = event.table - self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames) + self.writer = csv.writer(self.file, quoting=csv.QUOTE_ALL) if not self.header_written: - self.writer.writeheader() + self.writer.writerow(self.fieldnames) for event in events: if not event.verify: @@ -253,9 +252,9 @@ def update(self, event_or_events): if self.dst_table is None: self.dst_table = event.table - self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames) + self.writer = csv.writer(self.file, quoting=csv.QUOTE_ALL) if not self.header_written: - self.writer.writeheader() + self.writer.writerow(self.fieldnames) for event in events: if not event.verify: From 79323ed1d552c762160d2440dbafe1922e9a1cab Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Thu, 23 Sep 2021 11:02:10 +0300 Subject: [PATCH 62/67] Undo last change and add QUOTE_ALL to DictWriter --- clickhouse_mysql/writer/csvwriter.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/clickhouse_mysql/writer/csvwriter.py b/clickhouse_mysql/writer/csvwriter.py index c0e14a1..b9cf762 100644 --- a/clickhouse_mysql/writer/csvwriter.py +++ b/clickhouse_mysql/writer/csvwriter.py @@ -4,6 +4,7 @@ import csv import os.path import logging +import copy import time import uuid @@ -135,9 +136,9 @@ def insert(self, event_or_events): if self.dst_table is None: self.dst_table = event.table - self.writer = csv.writer(self.file, quoting=csv.QUOTE_ALL) + self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, quoting=csv.QUOTE_ALL) if not self.header_written: - self.writer.writerow(self.fieldnames) + self.writer.writeheader() for event in events: if not event.verify: @@ -189,9 +190,9 @@ def delete_row(self, event_or_events): if self.dst_table is None: self.dst_table = event.table - self.writer = csv.writer(self.file, quoting=csv.QUOTE_ALL) + self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, quoting=csv.QUOTE_ALL) if not self.header_written: - self.writer.writerow(self.fieldnames) + self.writer.writeheader() for event in events: if not event.verify: @@ -252,9 +253,9 @@ def update(self, event_or_events): if self.dst_table is None: self.dst_table = event.table - self.writer = csv.writer(self.file, quoting=csv.QUOTE_ALL) + self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, quoting=csv.QUOTE_ALL) if not self.header_written: - self.writer.writerow(self.fieldnames) + self.writer.writeheader() for event in events: if not event.verify: From 17123e4aab6f24fc7784124d0910ddb139550c42 Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Sat, 25 Sep 2021 18:12:49 +0300 Subject: [PATCH 63/67] Avoiding flushing with every round --- .flake8 | 13 +++++++++++++ clickhouse_mysql/pumper.py | 4 ++-- clickhouse_mysql/reader/mysqlreader.py | 15 ++++++--------- 3 files changed, 21 insertions(+), 11 deletions(-) create mode 100644 .flake8 diff --git 
a/.flake8 b/.flake8 new file mode 100644 index 0000000..420a518 --- /dev/null +++ b/.flake8 @@ -0,0 +1,13 @@ +[flake8] +ignore = + ; except + E722, + ; inline regex + W605, + ; long lines + E501, + ; too complex + C901 +max-complexity = 10 +max-line-length = 120 +application-import-names = flake8 \ No newline at end of file diff --git a/clickhouse_mysql/pumper.py b/clickhouse_mysql/pumper.py index ffffc83..245f9c2 100644 --- a/clickhouse_mysql/pumper.py +++ b/clickhouse_mysql/pumper.py @@ -26,7 +26,7 @@ def __init__(self, reader=None, writer=None): 'UpdateRowsEvent': self.update_rows_event, 'DeleteRowsEvent': self.delete_rows_event, # 'WriteRowsEvent.EachRow': self.write_rows_event_each_row, - 'ReaderIdleEvent': self.reader_idle_event, + # 'ReaderIdleEvent': self.reader_idle_event, }) def run(self): @@ -65,7 +65,7 @@ def update_rows_event(self, event=None): :param event: """ self.writer.update(event) - + def exit_gracefully(self, sig, frame): self.reader.close() diff --git a/clickhouse_mysql/reader/mysqlreader.py b/clickhouse_mysql/reader/mysqlreader.py index 4286792..5cb6c5c 100644 --- a/clickhouse_mysql/reader/mysqlreader.py +++ b/clickhouse_mysql/reader/mysqlreader.py @@ -12,7 +12,6 @@ from clickhouse_mysql.event.event import Event from clickhouse_mysql.tableprocessor import TableProcessor from clickhouse_mysql.util import Util -from pymysqlreplication.event import QueryEvent, RotateEvent, FormatDescriptionEvent class MySQLReader(Reader): @@ -32,7 +31,7 @@ class MySQLReader(Reader): exit_gracefully = False write_rows_event_num = 0 - write_rows_event_each_row_num = 0; + write_rows_event_each_row_num = 0 binlog_position_file = None @@ -323,7 +322,7 @@ def process_update_rows_event(self, mysql_event): # dispatch event to subscribers # statistics - #self.stat_write_rows_event_all_rows(mysql_event=mysql_event) + # self.stat_write_rows_event_all_rows(mysql_event=mysql_event) # dispatch Event event = Event() @@ -356,13 +355,13 @@ def process_delete_rows_event(self, mysql_event): return # statistics - #self.stat_write_rows_event_calc_rows_num_min_max(rows_num_per_event=len(mysql_event.rows)) + # self.stat_write_rows_event_calc_rows_num_min_max(rows_num_per_event=len(mysql_event.rows)) if self.subscribers('DeleteRowsEvent'): # dispatch event to subscribers # statistics - #self.stat_write_rows_event_all_rows(mysql_event=mysql_event) + # self.stat_write_rows_event_all_rows(mysql_event=mysql_event) # dispatch Event event = Event() @@ -422,8 +421,8 @@ def read(self): # skip other unhandled events pass - # after event processed, we need to handle current binlog position - self.process_binlog_position(self.binlog_stream.log_file, self.binlog_stream.log_pos) + # after event processed, we need to handle current binlog position + self.process_binlog_position(self.binlog_stream.log_file, self.binlog_stream.log_pos) except Exception as ex: if self.blocking: @@ -468,12 +467,10 @@ def read(self): logging.info('end %d', end_timestamp) logging.info('len %d', end_timestamp - self.start_timestamp) - def close(self): self.exit_gracefully = True self.nice_pause = 0 logging.info("MySQL should stop in the next loop") - if __name__ == '__main__': From 5dc884d1ee571edeef9e0a748cb89cc1c206cebf Mon Sep 17 00:00:00 2001 From: Alejandro Del Amo Date: Wed, 6 Oct 2021 12:28:02 +0200 Subject: [PATCH 64/67] Disabled Verify to avoid SSL checking --- clickhouse_mysql/writer/tbcsvwriter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/clickhouse_mysql/writer/tbcsvwriter.py 
b/clickhouse_mysql/writer/tbcsvwriter.py index f7a28f6..9684f25 100644 --- a/clickhouse_mysql/writer/tbcsvwriter.py +++ b/clickhouse_mysql/writer/tbcsvwriter.py @@ -69,7 +69,8 @@ def uploadCSV(self, table, filename, tries=1): 'Authorization': 'Bearer ' + self.tb_token, 'Content-Type': m.content_type }, - params=params) + params=params, + verify=False) # logging.debug(response.text) logging.info(response.json()) @@ -81,7 +82,7 @@ def uploadCSV(self, table, filename, tries=1): logging.error( f"Too many requests retrying in {retry_after} seconds to upload {filename } to {table}") time.sleep(retry_after) - self.uploadCSV(table, filename, tries+1) + self.uploadCSV(table, filename, tries + 1) else: # In case of error let's retry only logging.exception(response.json()) From 89ad6a6312fcb4f818e9d486840350ee4439b2a0 Mon Sep 17 00:00:00 2001 From: Alejandro Date: Mon, 16 Jan 2023 12:28:50 +0000 Subject: [PATCH 65/67] Generate CSV with QUOTE_MINIMAL --- clickhouse_mysql/writer/csvwriter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clickhouse_mysql/writer/csvwriter.py b/clickhouse_mysql/writer/csvwriter.py index b9cf762..58fbffc 100644 --- a/clickhouse_mysql/writer/csvwriter.py +++ b/clickhouse_mysql/writer/csvwriter.py @@ -136,7 +136,7 @@ def insert(self, event_or_events): if self.dst_table is None: self.dst_table = event.table - self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, quoting=csv.QUOTE_ALL) + self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, quoting=csv.QUOTE_MINIMAL) if not self.header_written: self.writer.writeheader() From 99d627d75df44614686d09a5fc24c8360ebf4e40 Mon Sep 17 00:00:00 2001 From: Alejandro Date: Mon, 16 Jan 2023 14:18:10 +0000 Subject: [PATCH 66/67] Use QUOTE_MINIMAL everywhere --- clickhouse_mysql/writer/csvwriter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clickhouse_mysql/writer/csvwriter.py b/clickhouse_mysql/writer/csvwriter.py index 58fbffc..3fb7951 100644 --- a/clickhouse_mysql/writer/csvwriter.py +++ b/clickhouse_mysql/writer/csvwriter.py @@ -190,7 +190,7 @@ def delete_row(self, event_or_events): if self.dst_table is None: self.dst_table = event.table - self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, quoting=csv.QUOTE_ALL) + self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, quoting=csv.QUOTE_MINIMAL) if not self.header_written: self.writer.writeheader() @@ -253,7 +253,7 @@ def update(self, event_or_events): if self.dst_table is None: self.dst_table = event.table - self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, quoting=csv.QUOTE_ALL) + self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames, quoting=csv.QUOTE_MINIMAL) if not self.header_written: self.writer.writeheader() From afeee2992665c6c0f7dfb8495d0e1ddb24bb614a Mon Sep 17 00:00:00 2001 From: YIMAN Date: Wed, 18 Jan 2023 12:16:55 +0100 Subject: [PATCH 67/67] change format for tb_upd column --- clickhouse_mysql/writer/csvwriter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clickhouse_mysql/writer/csvwriter.py b/clickhouse_mysql/writer/csvwriter.py index 3fb7951..34bd096 100644 --- a/clickhouse_mysql/writer/csvwriter.py +++ b/clickhouse_mysql/writer/csvwriter.py @@ -271,17 +271,17 @@ def generate_row(self, event): if isinstance(event.pymysqlreplication_event, WriteRowsEvent): for row in event: - row['tb_upd'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + row['tb_upd'] = datetime.datetime.now().strftime("%Y-%m-%d 
%H:%M:%S.%f") row['operation'] = 0 self.writer.writerow(self.convert(row)) elif isinstance(event.pymysqlreplication_event, DeleteRowsEvent): for row in event: - row['tb_upd'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + row['tb_upd'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") row['operation'] = 2 self.writer.writerow(self.convert(row)) elif isinstance(event.pymysqlreplication_event, UpdateRowsEvent): for row in event.pymysqlreplication_event.rows: - row['after_values']['tb_upd'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + row['after_values']['tb_upd'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") row['after_values']['operation'] = 1 self.writer.writerow(self.convert(row['after_values']))