文章/答案/技术大牛

发布

社区首页 >问答首页 >将文本分割成等长字符串，保持单词完整

问将文本分割成等长字符串，保持单词完整
EN

Stack Overflow用户

提问于 2015-12-26 20:08:28

回答 3查看 2K关注 0票数 7

我有这样的代码，它将较长的行分解成等长字符串数组--保留单词--它还考虑到像[[u;#fff;]some text]这样的格式，它分割文本，以便每个字符串可以独立地转换为html：

var format_re = /\[\[([!gbiuso]*;[^;\]]*;[^;\]]*(?:;|[^\]()]*);?[^\]]*)\]([^\]]*\\\][^\]]*|[^\]]*|[^\[]*\[[^\]]*)\]?/gi;
var format_begin_re = /(\[\[[!gbiuso]*;[^;]*;[^\]]*\])/i;
var format_last_re = /\[\[[!gbiuso]*;[^;]*;[^\]]*\]?$/i;
$.terminal.split_equal = function(str, length, words) {
  var formatting = false;
  var in_text = false;
  var prev_format = '';
  var result = [];
  // add format text as 5th paramter to formatting it's used for
  // data attribute in format function
  var array = str.replace(format_re, function(_, format, text) {
    var semicolons = format.match(/;/g).length;
    // missing semicolons
    if (semicolons == 2) {
      semicolons = ';;';
    } else if (semicolons == 3) {
      semicolons = ';';
    } else {
      semicolons = '';
    }
    // return '[[' + format + ']' + text + ']';
    // closing braket will break formatting so we need to escape
    // those using html entity equvalent
    return '[[' + format + semicolons +
      text.replace(/\\\]/g, '&#93;').replace(/\n/g, '\\n') + ']' +
      text + ']';
  }).split(/\n/g);
  for (var i = 0, len = array.length; i < len; ++i) {
    if (array[i] === '') {
      result.push('');
      continue;
    }
    var line = array[i];
    var first_index = 0;
    var count = 0;
    var space = -1;
    for (var j=0, jlen=line.length; j<jlen; ++j) {
      if (line[j] === '[' && line[j+1] === '[') {
        formatting = true;
      } else if (formatting && line[j] === ']') {
        if (in_text) {
          formatting = false;
          in_text = false;
        } else {
          in_text = true;
        }
      } else if ((formatting && in_text) || !formatting) {
        if (line[j] === '&') { // treat entity as one character
          var m = line.substring(j).match(/^(&[^;]+;)/);
          if (!m) {
            // should never happen if used by terminal,
            // because it always calls $.terminal.encode
            // before this function
            throw new Error("Unclosed html entity in line " +
                            (i+1) + ' at char ' + (j+1));
          }
          j+=m[1].length-2; // because continue adds 1 to j
          // if entity is at the end there is no next loop
          // issue #77
          if (j === jlen-1) {
            result.push(output + m[1]);
          }
          continue;
        } else if (line[j] === ']' && line[j-1] === '\\') {
          // escape \] counts as one character
          --count;
        } else {
          ++count;
        }
      }
      function is_space() {
        return line.substring(j-6, j) == '&nbsp;' ||
          line.substring(j-1, j) == ' ';
      }
      if (is_space() && ((formatting && in_text) || !formatting)) {
        space = j;
      }
      if ((count === length || j === jlen-1) &&
          ((formatting && in_text) || !formatting)) {
        var output;
        var after = line.substring(space, j+length+1);
        var text = $('<span>' + after + '</span>').text();
        var can_break = text.match(/\s/);
        if (words && space != -1 && j !== jlen-1 && can_break) {
          // get text to last space
          output = line.substring(first_index, space);
          j = space-1;
          space = -1;
        } else {
          output = line.substring(first_index, j+1);
        }
        if (words) {
          output = output.replace(/^(&nbsp;|\s)+|(&nbsp;|\s)+$/g, '');
        }
        first_index = j+1;
        count = 0;
        if (prev_format) {
          output = prev_format + output;
          if (output.match(']')) {
            prev_format = '';
          }
        }
        // Fix output if formatting not closed
        var matched = output.match(format_re);
        if (matched) {
          var last = matched[matched.length-1];
          if (last[last.length-1] !== ']') {
            prev_format = last.match(format_begin_re)[1];
            output += ']';
          } else if (output.match(format_last_re)) {
            var line_len = output.length;
            // why this line ???
            //var f_len = line_len-last[last.length-1].length;
            output = output.replace(format_last_re, '');
            prev_format = last.match(format_begin_re)[1];
          }
        }
        result.push(output);
      }
    }
  }
  return result;
};

它几乎正常工作，但有些线条比它应该喜欢的更短：

is cracker.The term

在此小提琴中，当您取消格式设置，选中复选框时，它就会正常工作。我工作了几个小时，不知道为什么这条线更短，任何帮助都会非常感谢。

javascript

jquery

回答 3

Stack Overflow用户

回答已采纳

发布于 2015-12-28 21:19:50

下面是如何修复原始代码：

在第40行之后添加以下内容：

in_text = false;

代码使用in_text标志来确定当前位置是否处于常规文本中。但是，当它进入格式化标记区域时，它没有清除标志。这就是问题中用超短线描述的主要问题的原因所在.

将第76/77行的if语句更改为：

if (is_space() && ((formatting && in_text) || !formatting || (line[j] === '[' && line[j+1] === '['))) {

这解决了一个较小的问题，在常规文本和格式化文本之间的空格上没有出现换行现象。

这里的工作小提琴：https://jsfiddle.net/2w10xp3m/1/

票数 5

Stack Overflow用户

发布于 2015-12-28 21:16:43

我想我已经用一种简单得多的方法解决了这个问题。首先拆分所有单词，然后重新组装行，同时跟踪当前格式。见JsFiddle。

JavaScript

$.terminal.split_equal = function(str, length, words) {
  var result = [],
    currentFormat = null,
    currentLine = '',
    currentLineLengthWithoutFormatting = 0;

  // 1. Split words on &nbsp;
  words = str.split(/&nbsp;/g);

  // 2. Re-assemble lines while keeping track of current formats
  words.forEach(function(word) {
    // Keep track of current format
    var format = word.match(/^\[\[([^\]]+)\]/g),
      wordWithFormatting, wordLength;
    if (format !== null && format[0]) {
      currentFormat = format[0];
      word = word.slice(format[0].length);
    }
    // Apply current format to each word separatly
    wordLength = word.length;
    wordWithFormatting = (currentFormat || '') + word;
    if (currentFormat) {
      if (word.indexOf(']') !== -1) {
        wordLength--;
        currentFormat = null;
      } else {
        wordWithFormatting += ']';
      }
    }
    // Assemble line
    if (currentLineLengthWithoutFormatting + wordLength <= length) {
      // Word still fits on current line
      if (currentLineLengthWithoutFormatting > 0) {
        currentLine += ' ';
        currentLineLengthWithoutFormatting++;
      }
    } else {
      // Need to start new line
      result.push(currentLine);
      currentLine = '';
      currentLineLengthWithoutFormatting = 0;
    }

    currentLine += wordWithFormatting;
    currentLineLengthWithoutFormatting += wordLength;
  });

  if (currentLineLengthWithoutFormatting > 0)
    result.push(currentLine);

  return result;
};

票数 4

Stack Overflow用户

发布于 2019-07-18 20:51:18

npm包段落建设者将连续的文本分割成所谓的段落，这些段落分布均匀，字数大致相同。这个段落的概念似乎是你所要寻找的。

您可以定义段落的字数。您可以将段落原则扩展到页面，考虑到页面平均包含的字符数和空格大致相同。

此段落生成器节点脚本从连续文本生成段落。它输出一个文本，其中每个段落的大小大致相同，在文本中提供了均匀的段落分布。它不会对"1.2“这样的数字进行拆分。

有一个选项可以定义段落之间的中断字符，或者可以将段落提取到字符串数组中，您可以从中应用html标记<p>。检查其文件以获得进一步的澄清。

票数 1

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/34474825

复制

相似问题

问将文本分割成等长字符串，保持单词完整
EN

回答 3

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问将文本分割成等长字符串，保持单词完整EN

回答 3

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问将文本分割成等长字符串，保持单词完整
EN