Skip to content

improve jd parser #349

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 52 additions & 1 deletion src/common/helper.js
Original file line number Diff line number Diff line change
Expand Up @@ -1903,6 +1903,56 @@ async function getMemberGroups (userId) {
return _.get(res, 'body')
}

/**
 * Removes markdown and html formatting from given text
 *
 * The text is passed through an ordered list of regular expression
 * substitutions; every removed piece of markup is replaced with a space so
 * that neighbouring words never get glued together. Patterns anchored with
 * `^` / `$` use the multiline flag so they apply to every line of the text,
 * not only to its first / last line.
 *
 * Leading and trailing whitespace is collapsed to a single space but not trimmed.
 *
 * @param {String} text formatted text (null / undefined are treated as an empty string)
 * @returns {String} cleaned words separated by single space
 */
function removeTextFormatting (text) {
  // Same coercion lodash's `_.replace` applies: null / undefined -> ''
  const source = text == null ? '' : String(text)
  return source
    // Remove horizontal rules
    .replace(/^(-\s*?|\*\s*?|_\s*?){3,}\s*$/gm, ' ')
    // Remove list markers (bullets and "1." style numbering), keep the indentation
    .replace(/^([\s\t]*)([*\-+]|\d+\.)\s+/gm, ' $1 ')
    // Header
    .replace(/\n={2,}/g, '\n')
    // Fenced codeblocks
    .replace(/~{3}.*\n/g, ' ')
    // Strikethrough
    .replace(/~~/g, ' ')
    // Fenced codeblocks
    .replace(/`{3}.*\n/g, ' ')
    // Remove HTML tags
    .replace(/<[^>]*>/g, ' ')
    // Remove setext-style headers (multiline: the underline is never the whole text)
    .replace(/^[=-]{2,}\s*$/gm, ' ')
    // Remove footnotes (multiline: `$` has to match the end of each line)
    .replace(/\[\^.+?\](: .*?$)?/gm, ' ')
    .replace(/\s{0,2}\[.*?\]: .*?$/gm, ' ')
    // Remove images
    .replace(/!\[(.*?)\][[(].*?[\])]/g, ' $1 ')
    // Remove inline links
    .replace(/\[(.*?)\][[(].*?[\])]/g, ' $1 ')
    // Remove blockquotes (multiline: quote markers appear on every quoted line)
    .replace(/^\s{0,3}>\s?/gm, ' ')
    // Remove reference-style links
    .replace(/^\s{1,2}\[(.*?)\]: (\S+)( ".*?")?\s*$/gm, ' ')
    // Remove atx-style headers; the closing hashes are optional.
    // NOTE: `#{1,6}?` would be a lazy quantifier (at least one `#` required),
    // hence the explicit optional group.
    .replace(/^#{1,6}\s*([^#]*)\s*(?:#{1,6})?$/gm, ' $1 ')
    // Remove emphasis (repeat the line to remove double emphasis)
    .replace(/([*_]{1,3})(\S.*?\S{0,1})\1/g, ' $2 ')
    .replace(/([*_]{1,3})(\S.*?\S{0,1})\1/g, ' $2 ')
    // Remove code blocks
    .replace(/(`{3,})(.*?)\1/gm, ' $2 ')
    // Remove inline code
    .replace(/`(.+?)`/g, ' $1 ')
    // Remove punctuation
    .replace(/[,"'?/\\]/g, ' ')
    // Replace every run of whitespace (newlines, tabs, non-breaking spaces, ...)
    // with a single space so the result can be split on ' '
    .replace(/\s+/g, ' ')
}

module.exports = {
getParamFromCliArgs,
promptUser,
Expand Down Expand Up @@ -1961,5 +2011,6 @@ module.exports = {
getUserByHandle,
substituteStringByObject,
createProject,
getMemberGroups
getMemberGroups,
removeTextFormatting
}
10 changes: 7 additions & 3 deletions src/services/TeamService.js
Original file line number Diff line number Diff line change
Expand Up @@ -834,13 +834,17 @@ async function getSkillsByJobDescription (currentUser, data) {
// unnecessary api calls which are extremely time consuming.
await _reloadCachedTopcoderSkills()
// replace markdown tags with spaces
let description = _.replace(data.description, /[`|^[\]{}~/,:-]|#{2,}|<br>/gi, ' ')
// replace all whitespace characters with single space
description = _.replace(description, /\s\s+/g, ' ')
const description = helper.removeTextFormatting(data.description)
// extract words from description
let words = _.split(description, ' ')
// remove stopwords from description
words = _.filter(words, word => stopWords.indexOf(word.toLowerCase()) === -1)
// include consecutive two word combinations
const twoWords = []
for (let i = 0; i < words.length - 1; i++) {
twoWords.push(`${words[i]} ${words[i + 1]}`)
}
words = _.concat(words, twoWords)
let foundSkills = []
const result = []
// try to match each word with skill names
Expand Down