Skip to content

Commit 6bce3af

Browse files
committed
better csv ingestion, with tests and docs
1 parent e5f3ccd commit 6bce3af

File tree

12 files changed

+162
-15
lines changed

12 files changed

+162
-15
lines changed

CHANGELOG.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,17 @@ insert into files (content) values (sqlpage.read_file_as_data_url(sqlpage.upload
3030
returning 'text' as component, 'Uploaded new file with id: ' || id as contents;
3131
```
3232

33+
The maximum size of uploaded files is configurable with the [`max_uploaded_file_size`](./configuration.md) configuration parameter. By default, it is set to 5 MiB.
34+
3335
#### Parsing CSV files
3436

3537
SQLPage can also parse uploaded CSV files and insert them directly into a database table.
36-
SQLPage re-uses PostgreSQL's `COPY` statement to import the CSV file into the database, but makes it work with any database, by emulating the same behavior with simple `INSERT` statements.
38+
SQLPage re-uses PostgreSQL's [`COPY` syntax](https://www.postgresql.org/docs/current/sql-copy.html)
39+
to import the CSV file into the database.
40+
When connected to a PostgreSQL database, SQLPage will use the native `COPY` statement,
41+
for super fast and efficient on-database CSV parsing.
42+
But it will also work with any other database, by
43+
parsing the CSV locally and emulating the same behavior with simple `INSERT` statements.
3744

3845
`user_file_upload.sql` :
3946
```sql

configuration.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ on a [JSON](https://en.wikipedia.org/wiki/JSON) file placed in `sqlpage/sqlpage.
1616
| `sqlite_extensions` | | An array of SQLite extensions to load, such as `mod_spatialite` |
1717
| `web_root` | `.` | The root directory of the web server, where the `index.sql` file is located. |
1818
| `allow_exec` | false | Allow usage of the `sqlpage.exec` function. Do this only if all users with write access to sqlpage query files and to the optional `sqlpage_files` table on the database are trusted. |
19-
| `max_uploaded_file_size` | 10485760 | Maximum size of uploaded files in bytes. Defaults to 10 MiB. |
19+
| `max_uploaded_file_size` | 5242880 | Maximum size of uploaded files in bytes. Defaults to 5 MiB. |
2020
| `https_domain` | | Domain name to request a certificate for. Setting this parameter will automatically make SQLPage listen on port 443 and request an SSL certificate. The server will take a little bit longer to start the first time it has to request a certificate. |
2121
| `https_certificate_email` | contact@<https_domain> | The email address to use when requesting a certificate. |
2222
| `https_certificate_cache_dir` | ./sqlpage/https | A writeable directory where to cache the certificates, so that SQLPage can serve https traffic immediately when it restarts. |
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
-- temporarily store the data in a table with text columns
2+
create temporary table if not exists product_tmp(name text, description text, price text);
3+
delete from product_tmp;
4+
5+
-- copy the data from the CSV file into the temporary table
6+
copy product_tmp(name, description, price) from 'product_data_file';
7+
8+
select 'table' as component;
9+
select * from product_tmp;

examples/official-site/sqlpage/migrations/01_documentation.sql

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,58 @@ INSERT INTO uploaded_file(name, data) VALUES(:filename, sqlpage.uploaded_file_da
394394
',
395395
json('[{"component":"form", "title": "Upload a picture", "validate": "Upload", "action": "examples/handle_picture_upload.sql"},
396396
{"name": "my_file", "type": "file", "accept": "image/png, image/jpeg", "label": "Picture", "description": "Upload a nice picture", "required": true}
397+
]')),
398+
('form', '
399+
## Bulk data insertion
400+
401+
You can use the `file` type to allow the user to upload a [CSV](https://en.wikipedia.org/wiki/Comma-separated_values)
402+
file containing data to insert in a table.
403+
404+
SQLPage can load data from a CSV file and insert it into a database table.
405+
SQLPage re-uses PostgreSQL''s [`COPY` syntax](https://www.postgresql.org/docs/current/sql-copy.html)
406+
to specify the format of the CSV file, but makes it work with all supported databases.
407+
408+
> When connected to a PostgreSQL database, SQLPage will use the native `COPY` statement,
409+
> for super fast and efficient on-database CSV parsing.
410+
> But it will also work transparently with other databases, by
411+
> parsing the CSV locally and emulating the same behavior with simple `INSERT` statements.
412+
413+
Here is how you could easily copy data from a CSV to a table in the database:
414+
415+
```sql
416+
copy product(name, description) from ''product_data_input''
417+
with (header true, delimiter '','', quote ''"'');
418+
```
419+
420+
If you want to pre-process the data before inserting it into the final table,
421+
you can use a temporary table to store the data, and then insert it into the final table:
422+
423+
```sql
424+
-- temporarily store the data in a table with text columns
425+
create temporary table if not exists product_tmp(name text, description text, price text);
426+
delete from product_tmp;
427+
428+
-- copy the data from the CSV file into the temporary table
429+
copy product_tmp(name, description, price) from ''product_data_input'';
430+
431+
-- insert the data into the final table, converting the price column to an integer
432+
insert into product(name, description, price)
433+
select name, description, CAST(price AS integer) from product_tmp
434+
where price is not null and description is not null and length(description) > 10;
435+
```
436+
437+
This will load the processed CSV into the product table, provided it has the following structure:
438+
439+
```csv
440+
name,description,price
441+
"SQLPage","A tool to create websites using SQL",0
442+
"PostgreSQL","A powerful open-source relational database",0
443+
"SQLite","A lightweight relational database",0
444+
"MySQL","A popular open-source relational database",0
445+
```
446+
',
447+
json('[{"component":"form", "title": "CSV import", "validate": "Load data", "action": "examples/handle_csv_upload.sql"},
448+
{"name": "product_data_input", "type": "file", "accept": "text/csv", "label": "Products", "description": "Upload a CSV with a name, description, and price columns", "required": true}
397449
]'))
398450
;
399451

examples/official-site/sqlpage/sqlpage.json

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# The documentation site is fully static, so we don't need to persist any data.
2+
database_url: "sqlite::memory:"
3+
4+
# We have a file upload example, and would like to limit the size of the uploaded files
5+
max_uploaded_file_size: 256000

src/app_config.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ fn default_web_root() -> PathBuf {
163163
}
164164

165165
fn default_max_file_size() -> usize {
166-
10 * 1024 * 1024
166+
5 * 1024 * 1024
167167
}
168168

169169
fn default_https_certificate_cache_dir() -> PathBuf {

src/webserver/database/csv_import.rs

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ use sqlparser::ast::{
66
CopyLegacyCsvOption, CopyLegacyOption, CopyOption, CopySource, CopyTarget, Statement,
77
};
88
use sqlx::{
9-
any::{AnyArguments, AnyKind},
10-
AnyConnection, Arguments, Executor,
9+
any::{AnyArguments, AnyConnectionKind, AnyKind},
10+
AnyConnection, Arguments, Executor, PgConnection,
1111
};
1212
use tokio::io::AsyncRead;
1313

@@ -80,7 +80,7 @@ impl<'a> CopyCsvOption<'a> {
8080
}
8181
}
8282

83-
pub fn extract_csv_copy_statement(stmt: &mut Statement) -> Option<CsvImport> {
83+
pub(super) fn extract_csv_copy_statement(stmt: &mut Statement) -> Option<CsvImport> {
8484
if let Statement::Copy {
8585
source: CopySource::Table {
8686
table_name,
@@ -137,7 +137,6 @@ pub fn extract_csv_copy_statement(stmt: &mut Statement) -> Option<CsvImport> {
137137
uploaded_file,
138138
})
139139
} else {
140-
log::warn!("COPY statement not compatible with SQLPage: {stmt}");
141140
None
142141
}
143142
}
@@ -157,10 +156,39 @@ pub(super) async fn run_csv_import(
157156
.await
158157
.with_context(|| "opening csv")?;
159158
let buffered = tokio::io::BufReader::new(file);
160-
run_csv_import_on_path(db, csv_import, buffered).await
159+
// private_get_mut is not supposed to be used outside of sqlx, but it is the only way to
160+
// access the underlying connection
161+
match db.private_get_mut() {
162+
AnyConnectionKind::Postgres(pg_connection) => {
163+
run_csv_import_postgres(pg_connection, csv_import, buffered).await
164+
}
165+
_ => run_csv_import_insert(db, csv_import, buffered).await,
166+
}
167+
.with_context(|| {
168+
format!(
169+
"running CSV import from {} to {}",
170+
csv_import.uploaded_file, csv_import.table_name
171+
)
172+
})
173+
}
174+
175+
/// This function does not parse the CSV file, it only sends it to postgres.
176+
/// This is the fastest way to import a CSV file into postgres
177+
async fn run_csv_import_postgres(
178+
db: &mut PgConnection,
179+
csv_import: &CsvImport,
180+
file: impl AsyncRead + Unpin + Send,
181+
) -> anyhow::Result<()> {
182+
let mut copy_transact = db
183+
.copy_in_raw(csv_import.query.as_str())
184+
.await
185+
.with_context(|| "running COPY IN")?;
186+
copy_transact.read_from(file).await?;
187+
copy_transact.finish().await?;
188+
Ok(())
161189
}
162190

163-
async fn run_csv_import_on_path(
191+
async fn run_csv_import_insert(
164192
db: &mut AnyConnection,
165193
csv_import: &CsvImport,
166194
file: impl AsyncRead + Unpin + Send,
@@ -287,13 +315,27 @@ async fn test_end_to_end() {
287315

288316
let mut copy_stmt = sqlparser::parser::Parser::parse_sql(
289317
&sqlparser::dialect::GenericDialect {},
290-
"COPY my_table (col1, col2) FROM 'my_file.csv' WITH (DELIMITER ';', HEADER)",
318+
"COPY my_table (col1, col2) FROM 'my_file.csv' (DELIMITER ';', HEADER)",
291319
)
292320
.unwrap()
293321
.into_iter()
294322
.next()
295323
.unwrap();
296324
let csv_import = extract_csv_copy_statement(&mut copy_stmt).unwrap();
325+
assert_eq!(
326+
csv_import,
327+
CsvImport {
328+
query: "COPY my_table (col1, col2) FROM STDIN (DELIMITER ';', HEADER)".into(),
329+
table_name: "my_table".into(),
330+
columns: vec!["col1".into(), "col2".into()],
331+
delimiter: Some(';'),
332+
quote: None,
333+
header: Some(true),
334+
null_str: None,
335+
escape: None,
336+
uploaded_file: "my_file.csv".into(),
337+
}
338+
);
297339
let mut conn = "sqlite::memory:"
298340
.parse::<sqlx::any::AnyConnectOptions>()
299341
.unwrap()
@@ -305,7 +347,7 @@ async fn test_end_to_end() {
305347
.unwrap();
306348
let csv = "col2;col1\na;b\nc;d"; // order is different from the table
307349
let file = csv.as_bytes();
308-
run_csv_import_on_path(&mut conn, &csv_import, file)
350+
run_csv_import_insert(&mut conn, &csv_import, file)
309351
.await
310352
.unwrap();
311353
let rows: Vec<(String, String)> = sqlx::query_as("SELECT * FROM my_table")

src/webserver/database/sql_to_json.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ pub fn sql_nonnull_to_json<'r>(mut get_ref: impl FnMut() -> sqlx::any::AnyValueR
9292

9393
/// Takes the first column of a row and converts it to a string.
9494
pub fn row_to_string(row: &AnyRow) -> Option<String> {
95-
let col = row.columns().get(0)?;
95+
let col = row.columns().first()?;
9696
match sql_to_json(row, col) {
9797
serde_json::Value::String(s) => Some(s),
9898
serde_json::Value::Null => None,

src/webserver/http.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use actix_web::dev::{fn_service, ServiceFactory, ServiceRequest};
77
use actix_web::error::ErrorInternalServerError;
88
use actix_web::http::header::{ContentType, Header, HttpDate, IfModifiedSince, LastModified};
99
use actix_web::http::{header, StatusCode, Uri};
10+
use actix_web::web::PayloadConfig;
1011
use actix_web::{
1112
dev::ServiceResponse, middleware, middleware::Logger, web, web::Bytes, App, HttpResponse,
1213
HttpServer,
@@ -476,6 +477,7 @@ pub fn create_app(
476477
.wrap(middleware::NormalizePath::new(
477478
middleware::TrailingSlash::MergeOnly,
478479
))
480+
.app_data(PayloadConfig::default().limit(app_state.config.max_uploaded_file_size * 2))
479481
.app_data(app_state)
480482
}
481483

tests/index.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,34 @@ async fn test_file_upload() -> actix_web::Result<()> {
112112
Ok(())
113113
}
114114

115+
#[actix_web::test]
116+
async fn test_csv_upload() -> actix_web::Result<()> {
117+
let req = get_request_to("/tests/upload_csv_test.sql")
118+
.await?
119+
.insert_header(("content-type", "multipart/form-data; boundary=1234567890"))
120+
.set_payload(
121+
"--1234567890\r\n\
122+
Content-Disposition: form-data; name=\"prices_file\"; filename=\"prices.csv\"\r\n\
123+
Content-Type: text/csv\r\n\
124+
\r\n\
125+
price,quantity\r\n\
126+
1,5\r\n\
127+
2.5,4\r\n\
128+
--1234567890--\r\n",
129+
)
130+
.to_srv_request();
131+
let resp = main_handler(req).await?;
132+
133+
assert_eq!(resp.status(), StatusCode::OK);
134+
let body = test::read_body(resp).await;
135+
let body_str = String::from_utf8(body.to_vec()).unwrap();
136+
assert!(
137+
body_str.contains("total: 15"),
138+
"{body_str}\nexpected to contain: total: 15"
139+
);
140+
Ok(())
141+
}
142+
115143
async fn get_request_to(path: &str) -> actix_web::Result<TestRequest> {
116144
init_log();
117145
let config = test_config();

tests/upload_csv_test.sql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
create table bill(quantity text, price text);
2+
copy bill(quantity, price) from 'prices_file' with (format csv, header true);
3+
select 'text' as component,
4+
'total: ' || sum(cast(quantity as float) * cast(price as float)) as contents
5+
from bill;

0 commit comments

Comments
 (0)