Skip to content

Commit 8249acc

Browse files
authored
Merge pull request #349 from eisbilir/improve-jd-parser
improve jd parser to clean the formatting up
2 parents e66100c + a5cba61 commit 8249acc

File tree

2 files changed

+59
-4
lines changed

2 files changed

+59
-4
lines changed

src/common/helper.js

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1903,6 +1903,56 @@ async function getMemberGroups (userId) {
19031903
return _.get(res, 'body')
19041904
}
19051905

1906+
/**
 * Removes markdown and html formatting from given text.
 *
 * The rules are applied strictly in the order written below; most of them
 * replace the matched markup with a space so neighbouring words never get
 * glued together. Runs of whitespace are collapsed at the end, but the result
 * is not trimmed, so it may begin or end with a single space.
 *
 * @param {String} text formatted text (null and undefined are treated as an empty string)
 * @returns {String} cleaned words separated by single space
 */
function removeTextFormatting (text) {
  const source = text == null ? '' : String(text)
  return source
    // Horizontal rules (three or more of -, * or _ alone on a line)
    .replace(/^(-\s*?|\*\s*?|_\s*?){3,}\s*$/gm, ' ')
    // List markers (bullets and "1." style numbering); the indentation is kept
    .replace(/^([\s\t]*)([*\-+]|\d+\.)\s+/gm, ' $1 ')
    // Header
    .replace(/\n={2,}/g, '\n')
    // Fenced codeblocks
    .replace(/~{3}.*\n/g, ' ')
    // Strikethrough
    .replace(/~~/g, ' ')
    // Fenced codeblocks
    .replace(/`{3}.*\n/g, ' ')
    // Remove HTML tags
    .replace(/<[^>]*>/g, ' ')
    // Remove setext-style headers
    // (multiline flag: the underline can sit on any line, not only when it is the whole text)
    .replace(/^[=-]{2,}\s*$/gm, ' ')
    // Remove footnotes (multiline flag so a definition is removed up to the end of its own line)
    .replace(/\[\^.+?\](: .*?$)?/gm, ' ')
    .replace(/\s{0,2}\[.*?\]: .*?$/gm, ' ')
    // Remove images (the alt text is kept)
    .replace(/!\[(.*?)\][[(].*?[\])]/g, ' $1 ')
    // Remove inline links (the link text is kept)
    .replace(/\[(.*?)\][[(].*?[\])]/g, ' $1 ')
    // Remove blockquotes (multiline flag so the marker is stripped on every line)
    .replace(/^\s{0,3}>\s?/gm, ' ')
    // Remove reference-style links
    .replace(/^\s{1,2}\[(.*?)\]: (\S+)( ".*?")?\s*$/gm, ' ')
    // Remove atx-style headers; the closing run of # is optional, so that the
    // common "## Title" form is stripped as well as "## Title ##"
    .replace(/^#{1,6}\s*([^#]*)\s*(?:#{1,6})?$/gm, ' $1 ')
    // Remove emphasis (repeat the line to remove double emphasis)
    .replace(/([*_]{1,3})(\S.*?\S{0,1})\1/g, ' $2 ')
    .replace(/([*_]{1,3})(\S.*?\S{0,1})\1/g, ' $2 ')
    // Remove code blocks
    .replace(/(`{3,})(.*?)\1/gm, ' $2 ')
    // Remove inline code
    .replace(/`(.+?)`/g, ' $1 ')
    // Remove punctuation
    .replace(/[,"'?/\\]/g, ' ')
    // Replace every newline with a space
    .replace(/\n/g, ' ')
    // replace all whitespace characters with single space
    .replace(/\s\s+/g, ' ')
}
1955+
19061956
module.exports = {
19071957
getParamFromCliArgs,
19081958
promptUser,
@@ -1961,5 +2011,6 @@ module.exports = {
19612011
getUserByHandle,
19622012
substituteStringByObject,
19632013
createProject,
1964-
getMemberGroups
2014+
getMemberGroups,
2015+
removeTextFormatting
19652016
}

src/services/TeamService.js

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -834,13 +834,17 @@ async function getSkillsByJobDescription (currentUser, data) {
834834
// unnecessary api calls which are extremely time consuming.
835835
await _reloadCachedTopcoderSkills()
836836
// replace markdown tags with spaces
837-
let description = _.replace(data.description, /[`|^[\]{}~/,:-]|#{2,}|<br>/gi, ' ')
838-
// replace all whitespace characters with single space
839-
description = _.replace(description, /\s\s+/g, ' ')
837+
const description = helper.removeTextFormatting(data.description)
840838
// extract words from description
841839
let words = _.split(description, ' ')
842840
// remove stopwords from description
843841
words = _.filter(words, word => stopWords.indexOf(word.toLowerCase()) === -1)
842+
// include consecutive two word combinations
843+
const twoWords = []
844+
for (let i = 0; i < words.length - 1; i++) {
845+
twoWords.push(`${words[i]} ${words[i + 1]}`)
846+
}
847+
words = _.concat(words, twoWords)
844848
let foundSkills = []
845849
const result = []
846850
// try to match each word with skill names

0 commit comments

Comments
 (0)