Skip to content

Commit a4d9ef2

Browse files
committed
fix(series import): a page's charset is respected now instead of assuming that it's always UTF-8.
Fix #1237
1 parent 0018ad4 commit a4d9ef2

File tree

4 files changed

+66
-3
lines changed

4 files changed

+66
-3
lines changed

src/main/java/ru/mystamps/web/feature/series/DownloadResult.java

+34-3
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,38 @@
2222
import lombok.RequiredArgsConstructor;
2323
import org.apache.commons.lang3.ArrayUtils;
2424
import org.apache.commons.lang3.StringUtils;
25+
import org.slf4j.Logger;
26+
import org.slf4j.LoggerFactory;
27+
import org.springframework.http.MediaType;
2528

29+
import java.nio.charset.Charset;
2630
import java.nio.charset.StandardCharsets;
2731

2832
@Getter
2933
@RequiredArgsConstructor(access = AccessLevel.PRIVATE)
3034
public class DownloadResult {
35+
private static final Logger LOG = LoggerFactory.getLogger(DownloadResult.class);
36+
3137
private final Code code;
3238
private final byte[] data;
3339
private final String contentType;
40+
private final Charset charset;
3441

3542
public static DownloadResult failed(Code code) {
36-
return new DownloadResult(code, ArrayUtils.EMPTY_BYTE_ARRAY, StringUtils.EMPTY);
43+
return new DownloadResult(
44+
code,
45+
ArrayUtils.EMPTY_BYTE_ARRAY,
46+
StringUtils.EMPTY,
47+
StandardCharsets.UTF_8
48+
);
3749
}
3850

3951
public static DownloadResult succeeded(byte[] data, String contentType) {
40-
return new DownloadResult(Code.SUCCESS, data, contentType);
52+
Charset charset = extractCharset(contentType);
53+
if (charset == null) {
54+
charset = StandardCharsets.UTF_8;
55+
}
56+
return new DownloadResult(Code.SUCCESS, data, contentType, charset);
4157
}
4258

4359
public boolean hasFailed() {
@@ -49,7 +65,7 @@ public boolean hasSucceeded() {
4965
}
5066

5167
public String getDataAsString() {
52-
return new String(data, StandardCharsets.UTF_8);
68+
return new String(data, charset);
5369
}
5470

5571
public enum Code {
@@ -62,4 +78,19 @@ public enum Code {
6278
UNEXPECTED_ERROR,
6379
}
6480

81+
private static Charset extractCharset(String contentType) {
82+
try {
83+
MediaType mediaType = MediaType.parseMediaType(contentType);
84+
return mediaType.getCharset();
85+
86+
} catch (IllegalArgumentException ex) {
87+
// MediaType.parseMediaType() might throw InvalidMediaTypeException.
88+
// MediaType.getCharset() might throw IllegalArgumentException,
89+
// IllegalCharsetNameException, or UnsupportedCharsetException.
90+
// All of them are inherited from IllegalArgumentException, so we catch only this class
91+
LOG.debug("Couldn't extract charset from '{}': {}", contentType, ex.getMessage());
92+
return null;
93+
}
94+
}
95+
6596
}

src/test/robotframework/series/import/request-logic.robot

+10
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,16 @@ Import series from external site with catalog numbers (use description locator)
7777
Click Link id:import-request-link
7878
Element Should Be Disabled id:michel-numbers
7979

80+
Import series when a page in a non-utf-8 charset
81+
[Documentation] Verify that a page's charset is respected
82+
Input Text id:url ${MOCK_SERVER}/series/import/request-logic/charset-windows-1251.html
83+
Submit Form id:import-series-form
84+
Element Text Should Be id:request-status ParsingSucceeded
85+
${category}= Get Selected List Label id:category
86+
${country}= Get Selected List Label id:country
87+
Should Be Equal ${category} Prehistoric animals
88+
Should Be Equal ${country} Italy
89+
8090
Import series and series sale with existing seller from an external site
8191
[Documentation] Verify import series and sale (with existing seller)
8292
Input Text id:url ${MOCK_SERVER}/series/import/request-logic/existing-seller.html
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<title>Series info (with windows-1251 charset)</title>
5+
</head>
6+
<body>
7+
Info: <span id="test-description">Динозавры, Италия</span>
8+
</body>
9+
</html>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"request": {
3+
"method": "GET",
4+
"url": "/series/import/request-logic/charset-windows-1251.html"
5+
},
6+
"response": {
7+
"status": 200,
8+
"headers": {
9+
"Content-Type": "text/html;charset=windows-1251"
10+
},
11+
"bodyFileName": "series/import/request-logic/charset-windows-1251.html"
12+
}
13+
}

0 commit comments

Comments
 (0)