From 17f95fe79bc32d3e6c008716e2bc623cbecf7a0d Mon Sep 17 00:00:00 2001 From: zeertzjq Date: Wed, 31 Jul 2024 06:37:58 +0800 Subject: vim-patch:9.0.0105: illegal memory access when pattern starts with illegal byte Problem: Illegal memory access when pattern starts with illegal byte. Solution: Do not match a character with an illegal byte. https://github.com/vim/vim/commit/f50940531dd57135fe60aa393ac9d3281f352d88 Co-authored-by: Bram Moolenaar --- test/old/testdir/test_regexp_utf8.vim | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'test') diff --git a/test/old/testdir/test_regexp_utf8.vim b/test/old/testdir/test_regexp_utf8.vim index f9ad1fc7ca..07695f9cc8 100644 --- a/test/old/testdir/test_regexp_utf8.vim +++ b/test/old/testdir/test_regexp_utf8.vim @@ -1,5 +1,7 @@ " Tests for regexp in utf8 encoding +source shared.vim + func s:equivalence_test() let str = "AÀÁÂÃÄÅĀĂĄǍǞǠǺȂȦȺḀẠẢẤẦẨẪẬẮẰẲẴẶ BƁɃḂḄḆ CÇĆĈĊČƇȻḈꞒ DĎĐƊḊḌḎḐḒ EÈÉÊËĒĔĖĘĚȄȆȨɆḔḖḘḚḜẸẺẼẾỀỂỄỆ FƑḞꞘ GĜĞĠĢƓǤǦǴḠꞠ HĤĦȞḢḤḦḨḪⱧ IÌÍÎÏĨĪĬĮİƗǏȈȊḬḮỈỊ JĴɈ KĶƘǨḰḲḴⱩꝀ LĹĻĽĿŁȽḶḸḺḼⱠ MḾṀṂ NÑŃŅŇǸṄṆṈṊꞤ OÒÓÔÕÖØŌŎŐƟƠǑǪǬǾȌȎȪȬȮȰṌṎṐṒỌỎỐỒỔỖỘỚỜỞỠỢ PƤṔṖⱣ QɊ RŔŖŘȐȒɌṘṚṜṞⱤꞦ SŚŜŞŠȘṠṢṤṦṨⱾꞨ TŢŤŦƬƮȚȾṪṬṮṰ UÙÚÛÜŨŪŬŮŰƯǕǙǛǓǗȔȖɄṲṴṶṸṺỤỦỨỪỬỮỰ VƲṼṾ WŴẀẂẄẆẈ XẊẌ YÝŶŸƳȲɎẎỲỴỶỸ ZŹŻŽƵẐẒẔⱫ aàáâãäåāăąǎǟǡǻȃȧᶏḁẚạảấầẩẫậắằẳẵặⱥ bƀɓᵬᶀḃḅḇ cçćĉċčƈȼḉꞓꞔ dďđɗᵭᶁᶑḋḍḏḑḓ eèéêëēĕėęěȅȇȩɇᶒḕḗḙḛḝẹẻẽếềểễệ fƒᵮᶂḟꞙ gĝğġģǥǧǵɠᶃḡꞡ hĥħȟḣḥḧḩḫẖⱨꞕ iìíîïĩīĭįǐȉȋɨᶖḭḯỉị jĵǰɉ kķƙǩᶄḱḳḵⱪꝁ lĺļľŀłƚḷḹḻḽⱡ mᵯḿṁṃ nñńņňʼnǹᵰᶇṅṇṉṋꞥ oòóôõöøōŏőơǒǫǭǿȍȏȫȭȯȱɵṍṏṑṓọỏốồổỗộớờởỡợ pƥᵱᵽᶈṕṗ qɋʠ rŕŗřȑȓɍɽᵲᵳᶉṛṝṟꞧ sśŝşšșȿᵴᶊṡṣṥṧṩꞩ tţťŧƫƭțʈᵵṫṭṯṱẗⱦ uùúûüũūŭůűųǚǖưǔǘǜȕȗʉᵾᶙṳṵṷṹṻụủứừửữự vʋᶌṽṿ wŵẁẃẅẇẉẘ xẋẍ yýÿŷƴȳɏẏẙỳỵỷỹ zźżžƶᵶᶎẑẓẕⱬ" let groups = split(str) @@ -559,6 +561,19 @@ func Test_match_invalid_byte() call delete('Xinvalid') endfunc +func Test_match_illegal_byte() + let lines =<< trim END + silent! buffer ÿ\c + next ÿ + 0scriptnames + source + END + call writefile(lines, 'Xregexp') + call system(GetVimCommand() .. ' -X -Z -e -s -S Xregexp -c qa!') + + call delete('Xregexp') +endfunc + func Test_match_too_complicated() set regexpengine=1 exe "noswapfile vsplit \xeb\xdb\x99" -- cgit From e57598fbef1de7c9089a58e3b428b87ba155cd0b Mon Sep 17 00:00:00 2001 From: zeertzjq Date: Wed, 31 Jul 2024 06:06:09 +0800 Subject: vim-patch:9.1.0645: regex: wrong match when searching multi-byte char case-insensitive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem: regex: wrong match when searching multi-byte char case-insensitive (diffsetter) Solution: Apply proper case-folding for characters and search-string This patch does the following 4 things: 1) When the regexp engine compares two utf-8 codepoints case insensitive it may match an adjacent character, because it assumes it can step over as many bytes as the pattern contains. This however is not necessarily true because of case-folding, a multi-byte UTF-8 character can be considered equal to some single-byte value. Let's consider the pattern 'ſ' and the string 's'. When comparing and ignoring case, the single character 's' matches, and since it matches Vim will try to step over the match (by the amount of bytes of the pattern), assuming that since it matches, the length of both strings is the same. However in that case, it should only step over the single byte value 's' by 1 byte and try to start matching after it again. So for the backtracking engine we need to ensure: * we try to match the correct length for the pattern and the text * in case of a match, we step over it correctly There is one tricky thing for the backtracing engine. We also need to calculate correctly the number of bytes to compare the 2 different utf-8 strings s1 and s2. So we will count the number of characters in s1 that the byte len specified. Then we count the number of bytes to step over the same number of characters in string s2 and then we can correctly compare the 2 utf-8 strings. 2) A similar thing can happen for the NFA engine, when skipping to the next character to test for a match. We are skipping over the regstart pointer, however we do not consider the case that because of case-folding we may need to adjust the number of bytes to skip over. So this needs to be adjusted in find_match_text() as well. 3) A related issue turned out, when prog->match_text is actually empty. In that case we should try to find the next match and skip this condition. 4) When comparing characters using collections, we must also apply case folding to each character in the collection and not just to the current character from the search string. This doesn't apply to the NFA engine, because internally it converts collections to branches [abc] -> a\|b\|c fixes: vim/vim#14294 closes: vim/vim#14756 https://github.com/vim/vim/commit/22e8e12d9f5034e1984db0c567b281fda4de8dd7 N/A patches: vim-patch:9.0.1771: regex: combining chars in collections not handled vim-patch:9.0.1777: patch 9.0.1771 causes problems Co-authored-by: Christian Brabandt --- test/old/testdir/test_regexp_utf8.vim | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'test') diff --git a/test/old/testdir/test_regexp_utf8.vim b/test/old/testdir/test_regexp_utf8.vim index 07695f9cc8..505e99919c 100644 --- a/test/old/testdir/test_regexp_utf8.vim +++ b/test/old/testdir/test_regexp_utf8.vim @@ -594,4 +594,36 @@ func Test_combining_chars_in_collection() bw! endfunc +func Test_search_multibyte_match_ascii() + new + " Match single 'ſ' and 's' + call setline(1, 'das abc heraus abc ſich abc ſind') + for i in range(0, 2) + exe "set re="..i + let ic_match = matchbufline('%', '\c\%u17f', 1, '$')->mapnew({idx, val -> val.text}) + let noic_match = matchbufline('%', '\C\%u17f', 1, '$')->mapnew({idx, val -> val.text}) + call assert_equal(['s', 's', 'ſ','ſ'], ic_match, "Ignorecase Regex-engine: " .. &re) + call assert_equal(['ſ','ſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re) + endfor + " Match several 'ſſ' and 'ss' + call setline(1, 'das abc herauss abc ſſich abc ſind') + for i in range(0, 2) + exe "set re="..i + let ic_match = matchbufline('%', '\c\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text}) + let noic_match = matchbufline('%', '\C\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text}) + let ic_match2 = matchbufline('%', '\c\%u17f\+', 1, '$')->mapnew({idx, val -> val.text}) + let noic_match2 = matchbufline('%', '\C\%u17f\+', 1, '$')->mapnew({idx, val -> val.text}) + let ic_match3 = matchbufline('%', '\c[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text}) + let noic_match3 = matchbufline('%', '\C[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text}) + + call assert_equal(['ss', 'ſſ'], ic_match, "Ignorecase Regex-engine: " .. &re) + call assert_equal(['ſſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re) + call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match2, "Ignorecase Regex-engine: " .. &re) + call assert_equal(['ſſ','ſ'], noic_match2, "No-Ignorecase Regex-engine: " .. &re) + call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match3, "Ignorecase Collection Regex-engine: " .. &re) + call assert_equal(['ſſ','ſ'], noic_match3, "No-Ignorecase Collection Regex-engine: " .. &re) + endfor + bw! +endfunc + " vim: shiftwidth=2 sts=2 expandtab -- cgit