diff --git a/docdiff.rb b/docdiff.rb old mode 100644 new mode 100755 diff --git a/docdiff/charstring.rb b/docdiff/charstring.rb index 1673f84..9beac1e 100644 --- a/docdiff/charstring.rb +++ b/docdiff/charstring.rb @@ -13,9 +13,20 @@ module CharString # @encoding = CharString.guess_encoding(string) # @eol = CharString.guess_eol(string) =end unnecessary + @fullstring = string super end + # Fullstring includes non-graphical characters like newlines and spaces. + # These are (optionally) kept separate, because users comparing documents + # don't care about whitespace changes. + attr_reader :fullstring + attr_writer :fullstring + + def to_s() + @fullstring + end + def encoding() @encoding # if @encoding @@ -289,7 +300,8 @@ module CharString count_latin_blank_char() + count_ja_blank_char() end - def split_to_word() + # splits a string into words, treating whitespaces as words. + def split_to_word_verbose() raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding] # raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol] scan(Regexp.new(Encodings[encoding]::WORD_REGEXP_SRC, @@ -298,6 +310,21 @@ module CharString ) end + # splits a string into words, storing whitespaces in an attribute of + # the preceding word. 
+ def split_to_word() + words = split_to_word_verbose() + word_regex = Regexp.new(Encodings[encoding]::PURE_WORD_REGEXP_SRC, + Regexp::MULTILINE, + encoding.sub(/ASCII/i, 'none')) + result = words.collect { |word| + word_text = word.scan(word_regex).select{|x| x != ""}[0] + word_text.extend CharString + word_text.fullstring = word + word_text + } + end + def count_word() split_to_word().size end diff --git a/docdiff/encoding/en_ascii.rb b/docdiff/encoding/en_ascii.rb index 6bab6ae..96f760e 100644 --- a/docdiff/encoding/en_ascii.rb +++ b/docdiff/encoding/en_ascii.rb @@ -67,9 +67,10 @@ module CharString PRINT.replace(Regexp.quote(PRINT)) # kludge to avoid warning "character class has `[' without escape" GRAPH.replace(Regexp.quote(GRAPH)) # kludge to avoid warning "character class has `[' without escape" - WORD_REGEXP_SRC = ["(?:[#{GRAPH}]+[#{BLANK}]?)", - "|(?:[#{SPACE}]+)", - "|(?:.+?)"].join + WORD_REGEXP_SRC = ["(?:[#{PUNCT}]+[#{SPACE}]*)", + "|(?:[#{ALNUM}]+[#{SPACE}]*)", + "|(?:[^#{GRAPH}]+)"].join + PURE_WORD_REGEXP_SRC = "[#{GRAPH}]+" # override default method, as ASCII has no Japanese in it def count_ja_graph_char() diff --git a/docdiff/view.rb b/docdiff/view.rb index 6535090..e15282b 100644 --- a/docdiff/view.rb +++ b/docdiff/view.rb @@ -50,20 +50,27 @@ class View str.gsub(tags[:outside_escape_pat]){|m| tags[:outside_escape_dic][m]} end + def format_block(block) + if block == nil + "" + else + block.collect {|x| x.to_s}.to_s + end + end + def apply_style(tags, headfoot = true) result = [] @difference.each{|block| operation = block.first + source = format_block(block[1]) + target = format_block(block[2]) if block_given? 
- source = yield block[1].to_s - target = yield block[2].to_s - else - source = block[1].to_s - target = block[2].to_s + source = yield source + target = yield target end case operation when :common_elt_elt - result << (tags[:start_common] + escape_outside(source, tags) + tags[:end_common]) + result << (tags[:start_common] + escape_outside(target, tags) + tags[:end_common]) when :change_elt result << (tags[:start_before_change] + escape_inside(source, tags) + tags[:end_before_change] + tags[:start_after_change] + escape_inside(target, tags) + tags[:end_after_change])