@@ -4,6 +4,7 @@
 https://github.com/toddwschneider/nyc-taxi-data
 """
 import concurrent.futures
+import gzip
 import io
 import multiprocessing
 from collections import OrderedDict
@@ -92,10 +93,10 @@ def parse_row(row: OrderedDict):

     return Point("taxi-trip-data") \
         .tag("dispatching_base_num", row['dispatching_base_num']) \
-        .tag("PULocationID", row['PULocationID']) \
-        .tag("DOLocationID", row['DOLocationID']) \
+        .tag("PULocationID", row['PUlocationID']) \
+        .tag("DOLocationID", row['DOlocationID']) \
         .tag("SR_Flag", row['SR_Flag']) \
-        .field("dropoff_datetime", row['dropoff_datetime']) \
+        .field("dropoff_datetime", row['dropOff_datetime']) \
         .time(row['pickup_datetime']) \
         .to_line_protocol()

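Side note on the key renames above: the new data source (the gzipped URL introduced in the hunk below) capitalizes its header columns as `PUlocationID`, `DOlocationID`, and `dropOff_datetime`, so the row lookups change while the tag and field names written to InfluxDB keep their original spelling. A minimal sanity check along those lines; the `EXPECTED` set and the fail-fast style are illustrative additions, not part of the commit:

    import csv
    import gzip
    from urllib.request import urlopen

    # Header names exactly as parse_row() reads them; a mismatch here is the
    # KeyError the renamed lookups above are meant to avoid.
    EXPECTED = {'dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
                'PUlocationID', 'DOlocationID', 'SR_Flag'}

    url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-01.csv.gz"
    with gzip.open(urlopen(url), mode='rt', encoding='utf-8') as stream:
        missing = EXPECTED - set(next(csv.reader(stream)))
        if missing:
            raise KeyError(f'CSV header is missing expected columns: {missing}')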
@@ -141,80 +142,79 @@ def init_counter(counter, progress, queue):
     progress_ = Value('i', 0)
     startTime = datetime.now()

-    url = "https://s3.amazonaws.com/nyc-tlc/trip+data/fhv_tripdata_2019-01.csv"
-    # url = "file:///Users/bednar/Developer/influxdata/influxdb-client-python/examples/fhv_tripdata_2019-01.csv"
+    url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-01.csv.gz"

     """
     Open URL and for stream data
     """
     response = urlopen(url)
     if response.headers:
         content_length = response.headers['Content-length']
-    io_wrapper = ProgressTextIOWrapper(response)
-    io_wrapper.progress = progress_

     """
-    Start writer as a new process
+    Open GZIP stream
     """
-    writer = InfluxDBWriter(queue_)
-    writer.start()
+    with gzip.open(response, 'rb') as stream:
+        io_wrapper = ProgressTextIOWrapper(stream, encoding='utf-8')
+        io_wrapper.progress = progress_

-    """
-    Create process pool for parallel encoding into LineProtocol
-    """
-    cpu_count = multiprocessing.cpu_count()
-    with concurrent.futures.ProcessPoolExecutor(cpu_count, initializer=init_counter,
-                                                initargs=(counter_, progress_, queue_)) as executor:
         """
-        Converts incoming HTTP stream into sequence of LineProtocol
+        Start writer as a new process
         """
-        data = rx \
-            .from_iterable(DictReader(io_wrapper)) \
-            .pipe(ops.buffer_with_count(10_000),
-                  # Parse 10_000 rows into LineProtocol on subprocess
-                  ops.flat_map(lambda rows: executor.submit(parse_rows, rows, content_length)))
+        writer = InfluxDBWriter(queue_)
+        writer.start()

         """
-        Write data into InfluxDB
+        Create process pool for parallel encoding into LineProtocol
         """
-        data.subscribe(on_next=lambda x: None, on_error=lambda ex: print(f'Unexpected error: {ex}'))
-
-    """
-    Terminate Writer
-    """
-    queue_.put(None)
-    queue_.join()
+        cpu_count = multiprocessing.cpu_count()
+        with concurrent.futures.ProcessPoolExecutor(cpu_count, initializer=init_counter,
+                                                    initargs=(counter_, progress_, queue_)) as executor:
+            """
+            Converts incoming HTTP stream into sequence of LineProtocol
+            """
+            data = rx \
+                .from_iterable(DictReader(io_wrapper)) \
+                .pipe(ops.buffer_with_count(10_000),
+                      # Parse 10_000 rows into LineProtocol on subprocess
+                      ops.map(lambda rows: executor.submit(parse_rows, rows, content_length)))
+
+            """
+            Write data into InfluxDB
+            """
+            data.subscribe(on_next=lambda x: None, on_error=lambda ex: print(f'Unexpected error: {ex}'))

-    print()
-    print(f'Import finished in: {datetime.now() - startTime}')
-    print()
-
-    """
-    Querying 10 pickups from dispatching 'B00008'
-    """
-    query = 'from(bucket:"my-bucket")' \
-            '|> range(start: 2019-01-01T00:00:00Z, stop: now()) ' \
-            '|> filter(fn: (r) => r._measurement == "taxi-trip-data")' \
-            '|> filter(fn: (r) => r.dispatching_base_num == "B00008")' \
-            '|> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")' \
-            '|> rename(columns: {_time: "pickup_datetime"})' \
-            '|> drop(columns: ["_start", "_stop"])|> limit(n:10, offset: 0)'
-
-    client = InfluxDBClient(url="http://localhost:8086", token="my-token", org="my-org", debug=False)
-    result = client.query_api().query(query=query)
+        """
+        Terminate Writer
+        """
+        queue_.put(None)
+        queue_.join()

-    """
-    Processing results
-    """
-    print()
-    print("=== Querying 10 pickups from dispatching 'B00008' ===")
-    print()
-    for table in result:
-        for record in table.records:
-            print(
-                f'Dispatching: {record["dispatching_base_num"]} pickup: {record["pickup_datetime"]} dropoff: {record["dropoff_datetime"]}')
+        print()
+        print(f'Import finished in: {datetime.now() - startTime}')
+        print()

-    """
-    Close client
-    """
-    client.close()
+        """
+        Querying 10 pickups from dispatching 'B00008'
+        """
+        query = 'from(bucket:"my-bucket")' \
+                '|> range(start: 2019-01-01T00:00:00Z, stop: now()) ' \
+                '|> filter(fn: (r) => r._measurement == "taxi-trip-data")' \
+                '|> filter(fn: (r) => r.dispatching_base_num == "B00008")' \
+                '|> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")' \
+                '|> rename(columns: {_time: "pickup_datetime"})' \
+                '|> drop(columns: ["_start", "_stop"])|> limit(n:10, offset: 0)'
+
+        with InfluxDBClient(url="http://localhost:8086", token="my-token", org="my-org", debug=False) as client:
+            result = client.query_api().query(query=query)
+
+            """
+            Processing results
+            """
+            print()
+            print("=== Querying 10 pickups from dispatching 'B00008' ===")
+            print()
+            for table in result:
+                for record in table.records:
+                    print(
+                        f'Dispatching: {record["dispatching_base_num"]} pickup: {record["pickup_datetime"]} dropoff: {record["dropoff_datetime"]}')
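The heart of this hunk: the retired `nyc-tlc` S3 CSV is replaced by a gzipped GitHub release asset, `gzip.open()` wraps the HTTP response directly, and everything downstream re-indents into the new `with` block. Because `urlopen()` returns a file-like object, decompression happens on the fly as the CSV reader pulls lines, so the archive is never fully downloaded or held in memory first. A stripped-down sketch of the same pattern, substituting the standard library's `io.TextIOWrapper` for the repository's `ProgressTextIOWrapper`:

    import gzip
    import io
    from csv import DictReader
    from urllib.request import urlopen

    url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-01.csv.gz"

    # urlopen() yields a readable binary stream; gzip.open() decompresses it
    # lazily, and TextIOWrapper decodes the result into text for DictReader.
    response = urlopen(url)
    with gzip.open(response, 'rb') as binary_stream:
        text_stream = io.TextIOWrapper(binary_stream, encoding='utf-8')
        for i, row in enumerate(DictReader(text_stream)):
            print(row['dispatching_base_num'], row['pickup_datetime'])
            if i >= 2:  # peek at the first few rows only
                break

One caveat: `Content-Length` still reports the compressed size while the progress wrapper now counts decompressed text, so the percentage derived from the two in `parse_rows` is presumably only approximate.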
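For reference, the reshaped RxPY pipeline in isolation: rows are buffered into batches of 10,000 and each batch is submitted to the process pool, so the observable now emits `concurrent.futures.Future` objects that the subscriber deliberately ignores; the parsed output presumably reaches the writer through the queue that `init_counter` installs in each worker. A self-contained sketch under those assumptions, with a trivial `parse_batch` standing in for the example's `parse_rows`:

    import concurrent.futures
    import multiprocessing

    import rx
    from rx import operators as ops


    def parse_batch(rows):
        # Stand-in for parse_rows(): must be a picklable top-level function
        # because it executes on a worker process.
        return sum(rows)


    if __name__ == '__main__':
        cpu_count = multiprocessing.cpu_count()
        with concurrent.futures.ProcessPoolExecutor(cpu_count) as executor:
            rx.from_iterable(range(100_000)) \
                .pipe(ops.buffer_with_count(10_000),
                      ops.map(lambda rows: executor.submit(parse_batch, rows))) \
                .subscribe(on_next=lambda future: print(future.result()))

Calling `future.result()` here only demonstrates that each batch completes; the example itself passes `on_next=lambda x: None` and lets the writer process drain the queue.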