在上一个问题(需要链接)中,我尝试删除推文中的话题标签(hashtag)和提及(@mention)。我的字符串是这样的文本:
@lien_ayy92 ??% Real ?Avail▶#Jakarta #Bekasi ?Excl/Incl ?Expo▶6-7 Juli #Cirebon ?Wajib DP? ?Cek BIO? https://local.com/
我想试着删除其中不重要的图标和文本。
我的代码基于之前那个问题中的代码:
/**
 * Removes noise from tweet-like text and returns what is left.
 *
 * Spaces are not matched by any alternative, so gaps remain where tokens
 * were removed.
 *
 * @param {string} data - Text to clean.
 * @returns {string} `data` with every pattern match deleted.
 */
let clean = function (data) {
  // Alternatives are tried left to right at each position:
  //   1. http/https/ftp URLs (scheme, "://", then a run of newline or non-whitespace chars)
  //   2. "@name" / "#tag" where the sigil is not preceded by a word character
  //   3. a word immediately followed by a trailing "@" or "#"
  //   4. runs of two or more non-word, non-space characters not touching a word character
  //   5. standalone ASCII words of one to three letters
  //   6. runs of digits
  //   7. single punctuation characters; note that "!-/" is a range covering "!" through "/"
  //   8. surrogate pairs \ud83d\ude00 - \ud83d\ude4f (U+1F600 - U+1F64F)
  const noisePattern = /(?:https?|ftp):\/\/[\n\S]+|\B[@#]\w+\b|\b\w+[@#]\B|\B[^\w\s]{2,}\B|\b[a-zA-Z]{1,3}\b|[0-9]+|[$&+,:;=?@#|'<>.^*()%!-/]|\ud83d[\ude00-\ude4f]/g;
  return data.replace(noisePattern, '');
};
/**
 * Tokenizes tweet-like text into lowercase words with stop words removed.
 *
 * The text is passed through `clean`, trimmed, lowercased and split on
 * whitespace; empty tokens and stop words are then dropped.
 *
 * @param {string} docs - Raw text to tokenize.
 * @param {Iterable<string>} [wordsstop=['about']] - Words to drop. Tokens are
 *   lowercased before comparison, so entries should be lowercase.
 * @returns {string[]} Remaining tokens in their original order.
 */
let stopwords = function (docs, wordsstop = ['about']) {
  const stopSet = new Set(wordsstop);

  const tokens = clean(docs)
    .trim()
    .toLowerCase()
    // Split on any whitespace run so tabs, newlines and repeated spaces
    // separate words instead of ending up inside a token.
    .split(/\s+/);

  // An input that trims to '' still splits to [''], so empty tokens are
  // dropped together with the stop words.
  return tokens.filter((token) => token !== '' && !stopSet.has(token));
};
// Run the tokenizer on a sample tweet and print the resulting word list.
const sampleTweet = '?@lien_ayy92 ??% Real ?Avail▶#Jakarta #Bekasi ?Excl/Incl ?Expo▶6-7 Juli #Cirebon ?Wajib DP? ?Cek BIO? https://local.com about data';
console.log(stopwords(sampleTweet));
我想要这样的结果:
["real","juli","data"];
发布于 2017-07-04 15:56:32
您可以使用Array.prototype.filter删除所有带有特殊字符的项。
/**
 * Deletes URLs, mentions/hashtags, short words, digits, punctuation and a
 * range of emoji from a string.
 *
 * No alternative matches the space character, so the spacing between the
 * surviving words is preserved (including gaps left by removed tokens).
 *
 * @param {string} data - Text to clean.
 * @returns {string} `data` with every pattern match replaced by ''.
 */
let clean = function (data) {
  // Alternatives, tried in this order at each position:
  //   - http/https/ftp URL: scheme, "://", then newline or non-whitespace chars
  //   - "@" or "#" followed by word characters, when not preceded by a word character
  //   - word characters followed by a trailing "@" or "#"
  //   - two or more non-word, non-space characters with no word character on either side
  //   - a whole ASCII word of one to three letters
  //   - a run of digits
  //   - one punctuation character from the class; "!-/" inside it is a range ("!" to "/")
  //   - surrogate pair \ud83d\ude00 - \ud83d\ude4f (U+1F600 - U+1F64F)
  const unwanted = /(?:https?|ftp):\/\/[\n\S]+|\B[@#]\w+\b|\b\w+[@#]\B|\B[^\w\s]{2,}\B|\b[a-zA-Z]{1,3}\b|[0-9]+|[$&+,:;=?@#|'<>.^*()%!-/]|\ud83d[\ude00-\ude4f]/g;
  return data.replace(unwanted, '');
};
/**
 * Tokenizes tweet-like text into lowercase words, dropping stop words and
 * any token that still contains a special character after cleaning.
 *
 * The text is passed through `clean`, trimmed, lowercased and split on
 * whitespace before filtering.
 *
 * @param {string} docs - Raw text to tokenize.
 * @param {Iterable<string>} [wordsstop=['about']] - Words to drop. Tokens are
 *   lowercased before comparison, so entries should be lowercase.
 * @returns {string[]} Remaining tokens in their original order.
 */
let stopwords = function (docs, wordsstop = ['about']) {
  const stopSet = new Set(wordsstop);
  // A token passes only if every character is an ASCII letter or one of the
  // listed punctuation characters ("!-/" is a range from "!" to "/").
  // The "+" also rejects the empty token produced by splitting an empty string.
  const allowedToken = /^[a-zA-Z$&+,:;=?@#|'<>.^*()%!-/]+$/;

  const tokens = clean(docs)
    .trim()
    .toLowerCase()
    // Split on any whitespace run so tabs, newlines and repeated spaces
    // separate words; a word joined to its neighbor by a tab or newline
    // would otherwise be rejected by the filter below.
    .split(/\s+/);

  const withoutStopwords = tokens.filter((token) => !stopSet.has(token));

  // filter code below: drop every token that contains a special character.
  return withoutStopwords.filter((token) => allowedToken.test(token));
};
// Run the filtered tokenizer on the sample tweet and print the resulting word list.
const sampleTweet = '?@lien_ayy92 ??% Real ?Avail▶#Jakarta #Bekasi ?Excl/Incl ?Expo▶6-7 Juli #Cirebon ?Wajib DP? ?Cek BIO? https://local.com about data';
console.log(stopwords(sampleTweet));
https://stackoverflow.com/questions/44899885
复制相似问题