Skip to content

Commit 066256d

Browse files
committed
refactor(JsoupSiteParser): modify for extracting multiple image URLs
Part of #698
1 parent 3c80632 commit 066256d

File tree

3 files changed

+92
-36
lines changed

3 files changed

+92
-36
lines changed

src/main/java/ru/mystamps/web/feature/series/importing/extractor/JsoupSiteParser.java

+35-8
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,15 @@
2727
import org.jsoup.Jsoup;
2828
import org.jsoup.nodes.Document;
2929
import org.jsoup.nodes.Element;
30+
import org.jsoup.select.Elements;
3031
import org.slf4j.Logger;
3132
import org.slf4j.LoggerFactory;
3233

34+
import java.util.Collections;
35+
import java.util.List;
36+
import java.util.Objects;
37+
import java.util.stream.Collectors;
38+
3339
// Getters/setters/no-arg constructor are being used in unit tests
3440
@Getter(AccessLevel.PROTECTED)
3541
@Setter(AccessLevel.PROTECTED)
@@ -96,7 +102,7 @@ public SeriesInfo parse(String htmlPage) {
96102

97103
info.setCategoryName(extractCategory(body));
98104
info.setCountryName(extractCountry(body));
99-
info.setImageUrl(extractImageUrl(body));
105+
info.setImageUrls(extractImageUrls(body));
100106
info.setIssueDate(extractIssueDate(body));
101107
info.setQuantity(extractQuantity(body));
102108
info.setPerforated(extractPerforated(body));
@@ -141,16 +147,26 @@ protected String extractCountry(Element body) {
141147
return country;
142148
}
143149

144-
protected String extractImageUrl(Element body) {
145-
Element elem = getFirstElement(body, imageUrlLocator);
146-
if (elem == null) {
147-
return null;
150+
protected List<String> extractImageUrls(Element body) {
151+
List<Element> elems = getElements(body, imageUrlLocator);
152+
if (elems.isEmpty()) {
153+
return Collections.emptyList();
148154
}
149155

150156
String attrName = ObjectUtils.firstNonNull(imageUrlAttribute, "href");
151-
String url = elem.absUrl(attrName);
152-
LOG.debug("Extracted image url: '{}'", url);
153-
return StringUtils.trimToNull(url);
157+
158+
List<String> urls = elems
159+
.stream()
160+
.map(elem -> elem.absUrl(attrName))
161+
.map(StringUtils::trimToNull)
162+
.filter(Objects::nonNull)
163+
.collect(Collectors.toList());
164+
if (urls.isEmpty()) {
165+
return Collections.emptyList();
166+
}
167+
168+
LOG.debug("Extracted {} image urls: {}", urls.size(), urls);
169+
return urls;
154170
}
155171

156172
protected String extractIssueDate(Element body) {
@@ -283,6 +299,17 @@ protected String extractCondition(Element body) {
283299
return description;
284300
}
285301

302+
private static List<Element> getElements(Element body, String locator) {
303+
if (locator == null) {
304+
return Collections.emptyList();
305+
}
306+
307+
Elements elems = body.select(locator);
308+
Validate.validState(elems != null, "Element.select(%s) must return non-null", locator);
309+
310+
return elems;
311+
}
312+
286313
private static Element getFirstElement(Element body, String locator) {
287314
if (locator == null) {
288315
return null;

src/main/java/ru/mystamps/web/feature/series/importing/extractor/SeriesInfo.java

+12-2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
import lombok.Setter;
2323
import lombok.ToString;
2424

25+
import java.util.List;
26+
2527
/**
2628
* Representation of a series info.
2729
*/
@@ -32,7 +34,7 @@
3234
public class SeriesInfo {
3335
private String categoryName;
3436
private String countryName;
35-
private String imageUrl;
37+
private List<String> imageUrls;
3638
private String issueDate;
3739
private String quantity;
3840
private String perforated;
@@ -44,14 +46,22 @@ public class SeriesInfo {
4446
private String altPrice;
4547
private String altCurrency;
4648
private String condition;
49+
50+
// for backward compatibility
51+
public String getImageUrl() {
52+
if (imageUrls == null || imageUrls.isEmpty()) {
53+
return null;
54+
}
55+
return imageUrls.get(0);
56+
}
4757

4858
/**
4959
* Check whether any info about a series is available.
5060
*/
5161
public boolean isEmpty() {
5262
return categoryName == null
5363
&& countryName == null
54-
&& imageUrl == null
64+
&& (imageUrls == null || imageUrls.isEmpty())
5565
&& issueDate == null
5666
&& quantity == null
5767
&& perforated == null

src/test/java/ru/mystamps/web/feature/series/importing/extractor/JsoupSiteParserTest.java

+45-26
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
import org.junit.jupiter.api.BeforeEach;
2525
import org.junit.jupiter.api.Test;
2626
import ru.mystamps.web.tests.Random;
27+
28+
import java.util.Arrays;
29+
import java.util.Collections;
30+
import java.util.List;
2731
import java.util.Locale;
2832

2933
import static io.qala.datagen.RandomShortApi.nullOr;
@@ -91,7 +95,7 @@ public void parseShouldExtractSeriesInfo() {
9195
expectedInfo.setCategoryName(expectedCategory);
9296
expectedInfo.setCountryName(expectedCountry);
9397
expectedInfo.setIssueDate(expectedIssueDate);
94-
expectedInfo.setImageUrl(expectedImageUrl);
98+
expectedInfo.setImageUrls(Collections.singletonList(expectedImageUrl));
9599
expectedInfo.setSellerName(expectedSellerName);
96100
expectedInfo.setSellerUrl(expectedSellerUrl);
97101
expectedInfo.setPrice(expectedPrice);
@@ -136,14 +140,23 @@ public void parseShouldExtractSeriesInfoFromFirstMatchedElements() {
136140
String expectedCategory = Random.categoryName();
137141
String expectedCountry = Random.countryName();
138142
String expectedIssueDate = Random.issueYear().toString();
139-
String imageUrl = String.format(
140-
"/%s-%s-%s.png",
143+
String firstImageUrl = String.format(
144+
"/%s-%s-%s-1.png",
145+
expectedCountry.toLowerCase(Locale.ENGLISH),
146+
expectedCategory.toLowerCase(Locale.ENGLISH),
147+
expectedIssueDate
148+
);
149+
String secondImageUrl = String.format(
150+
"/%s-%s-%s-2.png",
141151
expectedCountry.toLowerCase(Locale.ENGLISH),
142152
expectedCategory.toLowerCase(Locale.ENGLISH),
143153
expectedIssueDate
144154
);
155+
List<String> expectedImageUrls = Arrays.asList(
156+
baseUri + firstImageUrl,
157+
baseUri + secondImageUrl
158+
);
145159
String sellerUrl = String.format("/seller/%d/info.htm", positiveInteger());
146-
String expectedImageUrl = baseUri + imageUrl;
147160
String expectedSellerName = Random.sellerName();
148161
String expectedSellerUrl = baseUri + sellerUrl;
149162
String expectedPrice = Random.price().toString();
@@ -166,7 +179,8 @@ public void parseShouldExtractSeriesInfoFromFirstMatchedElements() {
166179
expectedInfo.setCategoryName(expectedCategory);
167180
expectedInfo.setCountryName(expectedCountry);
168181
expectedInfo.setIssueDate(expectedIssueDate);
169-
expectedInfo.setImageUrl(expectedImageUrl);
182+
// in case of image URLs, it should find all of them
183+
expectedInfo.setImageUrls(expectedImageUrls);
170184
expectedInfo.setSellerName(expectedSellerName);
171185
expectedInfo.setSellerUrl(expectedSellerUrl);
172186
expectedInfo.setPrice(expectedPrice);
@@ -189,7 +203,7 @@ public void parseShouldExtractSeriesInfoFromFirstMatchedElements() {
189203
+ "<h1>ignored</h1>"
190204
+ "<p>ignored</p>"
191205
+ "<span>ignored</span>"
192-
+ "<a class='image' href='none'>look at image</a>"
206+
+ "<a class='image' href='%s'>look at image</a>"
193207
+ "<a class='seller' href='none'>seller name</a>"
194208
+ "<b>ignored</b>"
195209
+ "<div>ignored</div>"
@@ -200,13 +214,14 @@ public void parseShouldExtractSeriesInfoFromFirstMatchedElements() {
200214
expectedCategory,
201215
expectedCountry,
202216
expectedIssueDate,
203-
expectedImageUrl,
217+
firstImageUrl,
204218
expectedSellerUrl,
205219
expectedSellerName,
206220
expectedPrice,
207221
expectedCurrency,
208222
expectedAltPrice,
209-
expectedAltCurrency
223+
expectedAltCurrency,
224+
secondImageUrl
210225
);
211226

212227
SeriesInfo info = parser.parse(html);
@@ -335,31 +350,31 @@ public void extractCountryShouldReturnTextOfShortDescriptionLocator() {
335350
}
336351

337352
//
338-
// Tests for extractImageUrl()
353+
// Tests for extractImageUrls()
339354
//
340355

341356
@Test
342-
public void extractImageUrlShouldReturnNullWhenLocatorIsNotSet() {
357+
public void extractImageUrlsShouldReturnEmptyResultWhenLocatorIsNotSet() {
343358
parser.setImageUrlLocator(null);
344359
Element doc = createEmptyDocument();
345360

346-
String imageUrl = parser.extractImageUrl(doc);
361+
List<String> imageUrls = parser.extractImageUrls(doc);
347362

348-
assertThat(imageUrl).isNull();
363+
assertThat(imageUrls).isEmpty();
349364
}
350365

351366
@Test
352-
public void extractImageUrlShouldReturnNullWhenElementNotFound() {
367+
public void extractImageUrlsShouldReturnEmptyResultWhenElementNotFound() {
353368
parser.setImageUrlLocator(Random.jsoupLocator());
354369
Element doc = createEmptyDocument();
355370

356-
String imageUrl = parser.extractImageUrl(doc);
371+
List<String> imageUrls = parser.extractImageUrls(doc);
357372

358-
assertThat(imageUrl).isNull();
373+
assertThat(imageUrls).isEmpty();
359374
}
360375

361376
@Test
362-
public void extractImageUrlShouldReturnValueOfImageUrlAttribute() {
377+
public void extractImageUrlsShouldReturnValueOfImageUrlAttribute() {
363378
parser.setImageUrlLocator("a");
364379
parser.setImageUrlAttribute("data-full-path");
365380

@@ -371,14 +386,16 @@ public void extractImageUrlShouldReturnValueOfImageUrlAttribute() {
371386
);
372387
Element doc = createDocumentFromText(html);
373388

374-
String imageUrl = parser.extractImageUrl(doc);
389+
List<String> imageUrls = parser.extractImageUrls(doc);
375390

376-
assertThat(imageUrl).as("couldn't extract image url from '%s'", doc)
377-
.isEqualTo(expectedImageUrl);
391+
assertThat(imageUrls).as("couldn't extract image urls from '%s'", doc)
392+
.hasOnlyOneElementSatisfying(
393+
url -> assertThat(url).isEqualTo(expectedImageUrl)
394+
);
378395
}
379396

380397
@Test
381-
public void extractImageUrlShouldReturnValueOfHrefAttributeByDefault() {
398+
public void extractImageUrlsShouldReturnValueOfHrefAttributeByDefault() {
382399
parser.setImageUrlLocator("a");
383400
parser.setImageUrlAttribute(null);
384401

@@ -390,22 +407,24 @@ public void extractImageUrlShouldReturnValueOfHrefAttributeByDefault() {
390407
);
391408
Element doc = createDocumentFromText(html);
392409

393-
String imageUrl = parser.extractImageUrl(doc);
410+
List<String> imageUrls = parser.extractImageUrls(doc);
394411

395-
assertThat(imageUrl).as("couldn't extract image url from '%s'", doc)
396-
.isEqualTo(expectedImageUrl);
412+
assertThat(imageUrls).as("couldn't extract image urls from '%s'", doc)
413+
.hasOnlyOneElementSatisfying(
414+
url -> assertThat(url).isEqualTo(expectedImageUrl)
415+
);
397416
}
398417

399418
@Test
400-
public void extractImageUrlShouldReturnNullInsteadOfEmptyString() {
419+
public void extractImageUrlsShouldIgnoreEmptyUrls() {
401420
parser.setImageUrlLocator("a");
402421

403422
String html = "<a href=''>test</a>";
404423
Element doc = createDocumentFromText(html);
405424

406-
String imageUrl = parser.extractImageUrl(doc);
425+
List<String> imageUrls = parser.extractImageUrls(doc);
407426

408-
assertThat(imageUrl).isNull();
427+
assertThat(imageUrls).isEmpty();
409428
}
410429

411430
//

0 commit comments

Comments
 (0)