Skip to content

Commit 52be3cc

Browse files
committed
Series import: filter out invalid candidates.
Fix #821
1 parent 655d658 commit 52be3cc

File tree

2 files changed

+48
-6
lines changed

2 files changed

+48
-6
lines changed

src/main/java/ru/mystamps/web/service/SeriesInfoExtractorServiceImpl.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import java.util.Arrays;
2121
import java.util.Collections;
2222
import java.util.List;
23+
import java.util.function.Predicate;
2324
import java.util.regex.Matcher;
2425
import java.util.regex.Pattern;
2526
import java.util.stream.Collectors;
@@ -34,8 +35,15 @@
3435

3536
import ru.mystamps.web.service.dto.RawParsedDataDto;
3637
import ru.mystamps.web.service.dto.SeriesExtractedInfo;
38+
import ru.mystamps.web.validation.ValidationRules;
3739

3840
@RequiredArgsConstructor
41+
@SuppressWarnings({
42+
// predicate names in camel case more readable than in uppercase
43+
"PMD.VariableNamingConventions", "checkstyle:constantname",
44+
// these "||" on the same line because it's more readable
45+
"checkstyle:operatorwrap"
46+
})
3947
public class SeriesInfoExtractorServiceImpl implements SeriesInfoExtractorService {
4048

4149
// Related to RELEASE_YEAR_REGEXP and used in unit tests.
@@ -51,6 +59,28 @@ public class SeriesInfoExtractorServiceImpl implements SeriesInfoExtractorServic
5159
Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE
5260
);
5361

62+
// CheckStyle: ignore LineLength for next 9 lines
63+
private static final Pattern VALID_CATEGORY_NAME_EN = Pattern.compile(ValidationRules.CATEGORY_NAME_EN_REGEXP);
64+
private static final Pattern VALID_CATEGORY_NAME_RU = Pattern.compile(ValidationRules.CATEGORY_NAME_RU_REGEXP);
65+
private static final Pattern VALID_COUNTRY_NAME_EN = Pattern.compile(ValidationRules.COUNTRY_NAME_EN_REGEXP);
66+
private static final Pattern VALID_COUNTRY_NAME_RU = Pattern.compile(ValidationRules.COUNTRY_NAME_RU_REGEXP);
67+
68+
private static final Predicate<String> tooShortCategoryName = name -> name.length() >= ValidationRules.CATEGORY_NAME_MIN_LENGTH;
69+
private static final Predicate<String> tooLongCategoryName = name -> name.length() <= ValidationRules.CATEGORY_NAME_MAX_LENGTH;
70+
private static final Predicate<String> tooShortCountryName = name -> name.length() >= ValidationRules.COUNTRY_NAME_MIN_LENGTH;
71+
private static final Predicate<String> tooLongCountryName = name -> name.length() <= ValidationRules.COUNTRY_NAME_MAX_LENGTH;
72+
73+
private static final Predicate<String> invalidCategoryName = name ->
74+
VALID_CATEGORY_NAME_EN.matcher(name).matches() ||
75+
VALID_CATEGORY_NAME_RU.matcher(name).matches();
76+
77+
private static final Predicate<String> invalidCountryName = name ->
78+
VALID_COUNTRY_NAME_EN.matcher(name).matches() ||
79+
VALID_COUNTRY_NAME_RU.matcher(name).matches();
80+
81+
// Max number of candidates that will be used in the SQL query within IN() statement.
82+
private static final long MAX_CANDIDATES_FOR_LOOKUP = 50;
83+
5484
private final Logger log;
5585
private final CategoryService categoryService;
5686
private final CountryService countryService;
@@ -74,6 +104,8 @@ public SeriesExtractedInfo extract(RawParsedDataDto data) {
74104
);
75105
}
76106

107+
// CheckStyle: ignore LineLength for next 1 line
108+
// @todo #821 SeriesInfoExtractorServiceImpl.extractCategory(): add unit tests for filtering invalid names
77109
protected List<Integer> extractCategory(String fragment) {
78110
if (StringUtils.isBlank(fragment)) {
79111
return Collections.emptyList();
@@ -83,7 +115,11 @@ protected List<Integer> extractCategory(String fragment) {
83115

84116
String[] candidates = StringUtils.split(fragment, "\n\t ,");
85117
List<String> uniqueCandidates = Arrays.stream(candidates)
118+
.filter(tooShortCategoryName)
119+
.filter(tooLongCategoryName)
120+
.filter(invalidCategoryName)
86121
.distinct()
122+
.limit(MAX_CANDIDATES_FOR_LOOKUP)
87123
.collect(Collectors.toList());
88124

89125
log.debug("Possible candidates: {}", uniqueCandidates);
@@ -108,6 +144,8 @@ protected List<Integer> extractCategory(String fragment) {
108144
return Collections.emptyList();
109145
}
110146

147+
// CheckStyle: ignore LineLength for next 1 line
148+
// @todo #821 SeriesInfoExtractorServiceImpl.extractCountry(): add unit tests for filtering invalid names
111149
protected List<Integer> extractCountry(String fragment) {
112150
if (StringUtils.isBlank(fragment)) {
113151
return Collections.emptyList();
@@ -117,7 +155,11 @@ protected List<Integer> extractCountry(String fragment) {
117155

118156
String[] candidates = StringUtils.split(fragment, "\n\t ,");
119157
List<String> uniqueCandidates = Arrays.stream(candidates)
158+
.filter(tooShortCountryName)
159+
.filter(tooLongCountryName)
160+
.filter(invalidCountryName)
120161
.distinct()
162+
.limit(MAX_CANDIDATES_FOR_LOOKUP)
121163
.collect(Collectors.toList());
122164

123165
log.debug("Possible candidates: {}", uniqueCandidates);

src/test/groovy/ru/mystamps/web/service/SeriesInfoExtractorServiceImplTest.groovy

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,16 +85,16 @@ class SeriesInfoExtractorServiceImplTest extends Specification {
8585
given:
8686
List<Integer> expectedResult = Random.listOfIntegers()
8787
when:
88-
List<Integer> result = service.extractCategory('foo1 foo2')
88+
List<Integer> result = service.extractCategory('Sport Space')
8989
then:
9090
// in order to search by prefix, we shouldn't find anything by name
9191
1 * categoryService.findIdsByNames(_ as List<String>) >> Collections.emptyList()
9292
and:
9393
// the first lookup will find nothing
94-
1 * categoryService.findIdsWhenNameStartsWith('foo1') >> Collections.emptyList()
94+
1 * categoryService.findIdsWhenNameStartsWith('Sport') >> Collections.emptyList()
9595
and:
9696
// the second lookup will return a result
97-
1 * categoryService.findIdsWhenNameStartsWith('foo2') >> expectedResult
97+
1 * categoryService.findIdsWhenNameStartsWith('Space') >> expectedResult
9898
and:
9999
result == expectedResult
100100
}
@@ -151,16 +151,16 @@ class SeriesInfoExtractorServiceImplTest extends Specification {
151151
given:
152152
List<Integer> expectedResult = Random.listOfIntegers()
153153
when:
154-
List<Integer> result = service.extractCountry('bar1 bar2')
154+
List<Integer> result = service.extractCountry('Sweden Norway')
155155
then:
156156
// in order to search by prefix, we shouldn't find anything by name
157157
1 * countryService.findIdsByNames(_ as List<String>) >> Collections.emptyList()
158158
and:
159159
// the first lookup will find nothing
160-
1 * countryService.findIdsWhenNameStartsWith('bar1') >> Collections.emptyList()
160+
1 * countryService.findIdsWhenNameStartsWith('Sweden') >> Collections.emptyList()
161161
and:
162162
// the second lookup will return a result
163-
1 * countryService.findIdsWhenNameStartsWith('bar2') >> expectedResult
163+
1 * countryService.findIdsWhenNameStartsWith('Norway') >> expectedResult
164164
and:
165165
result == expectedResult
166166
}

0 commit comments

Comments
 (0)