Skip to content
This repository has been archived by the owner on Aug 5, 2024. It is now read-only.

Stop breaking surrogate pairs in toDelta()/fromDelta() #80

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 141 additions & 8 deletions java/src/name/fraser/neil/plaintext/diff_match_patch.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package name.fraser.neil.plaintext;

import java.io.UnsupportedEncodingException;
import java.lang.Character;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.*;
Expand Down Expand Up @@ -1293,6 +1294,46 @@ public void diff_cleanupMerge(LinkedList<Diff> diffs) {
}
}

/**
* Rearrange diff boudnaries that split Unicode surrogate pairs.
* @param diffs Linked list of diff objects
*/
public void diff_cleanupSplitSurrogates(List<Diff> diffs) {
char lastEnd = 0;
boolean isFirst = true;
HashSet<Diff> toRemove = new HashSet<Diff>();

for (Diff aDiff : diffs) {
if (aDiff.text.isEmpty()) {
toRemove.add(aDiff);
continue;
}

char thisTop = aDiff.text.charAt(0);
char thisEnd = aDiff.text.charAt(aDiff.text.length() - 1);

if (Character.isHighSurrogate(thisEnd)) {
lastEnd = thisEnd;
aDiff.text = aDiff.text.substring(0, aDiff.text.length() - 1);
}

if (!isFirst && Character.isHighSurrogate(lastEnd) && Character.isLowSurrogate(thisTop)) {
aDiff.text = lastEnd + aDiff.text;
}

isFirst = false;

if ( aDiff.text.isEmpty() ) {
toRemove.add(aDiff);
continue;
}
}

for (Diff aDiff : toRemove) {
diffs.remove(aDiff);
}
}

/**
* loc is a location in text1, compute and return the equivalent location in
* text2.
Expand Down Expand Up @@ -1429,6 +1470,7 @@ public int diff_levenshtein(List<Diff> diffs) {
*/
public String diff_toDelta(List<Diff> diffs) {
StringBuilder text = new StringBuilder();
this.diff_cleanupSplitSurrogates(diffs);
for (Diff aDiff : diffs) {
switch (aDiff.operation) {
case INSERT:
Expand Down Expand Up @@ -1457,6 +1499,103 @@ public String diff_toDelta(List<Diff> diffs) {
return delta;
}

private int digit16(char b) throws IllegalArgumentException {
switch (b) {
case '0': return 0;
case '1': return 1;
case '2': return 2;
case '3': return 3;
case '4': return 4;
case '5': return 5;
case '6': return 6;
case '7': return 7;
case '8': return 8;
case '9': return 9;
case 'A': case 'a': return 10;
case 'B': case 'b': return 11;
case 'C': case 'c': return 12;
case 'D': case 'd': return 13;
case 'E': case 'e': return 14;
case 'F': case 'f': return 15;
default:
throw new IllegalArgumentException();
}
}

private String decodeURI(String text) throws IllegalArgumentException {
int i = 0;
StringBuilder decoded = new StringBuilder(text.length());

while (i < text.length()) {
if (text.charAt(i) != '%') {
decoded.append(text.charAt(i++));
continue;
}

// start a percent-sequence
int byte1 = (digit16(text.charAt(i + 1)) << 4) + digit16(text.charAt(i + 2));
if ((byte1 & 0x80) == 0) {
decoded.append(Character.toChars(byte1));
i += 3;
continue;
}

if ( text.charAt(i + 3) != '%') {
throw new IllegalArgumentException();
}

int byte2 = (digit16(text.charAt(i + 4)) << 4) + digit16(text.charAt(i + 5));
if ((byte2 & 0xC0) != 0x80) {
throw new IllegalArgumentException();
}
byte2 = byte2 & 0x3F;
if ((byte1 & 0xE0) == 0xC0) {
decoded.append(Character.toChars(((byte1 & 0x1F) << 6) | byte2));
i += 6;
continue;
}

if (text.charAt(i + 6) != '%') {
throw new IllegalArgumentException();
}

int byte3 = (digit16(text.charAt(i + 7)) << 4) + digit16(text.charAt(i + 8));
if ((byte3 & 0xC0) != 0x80) {
throw new IllegalArgumentException();
}
byte3 = byte3 & 0x3F;
if ((byte1 & 0xF0) == 0xE0) {
// unpaired surrogate are fine here
decoded.append(Character.toChars(((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3));
i += 9;
continue;
}

if (text.charAt(i + 9) != '%') {
throw new IllegalArgumentException();
}

int byte4 = (digit16(text.charAt(i + 10)) << 4) + digit16(text.charAt(i + 11));
if ((byte4 & 0xC0) != 0x80) {
throw new IllegalArgumentException();
}
byte4 = byte4 & 0x3F;
if ((byte1 & 0xF8) == 0xF0) {
int codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
decoded.append(Character.toChars((codePoint & 0xFFFF) >>> 10 & 0x3FF | 0xD800));
decoded.append(Character.toChars(0xDC00 | (codePoint & 0xFFFF) & 0x3FF));
i += 12;
continue;
}
}

throw new IllegalArgumentException();
}

return decoded.toString();
}

/**
* Given the original text1, and an encoded string which describes the
* operations required to transform text1 into text2, compute the full diff.
Expand All @@ -1483,10 +1622,7 @@ public LinkedList<Diff> diff_fromDelta(String text1, String delta)
// decode would change all "+" to " "
param = param.replace("+", "%2B");
try {
param = URLDecoder.decode(param, "UTF-8");
} catch (UnsupportedEncodingException e) {
// Not likely on modern system.
throw new Error("This system does not support UTF-8.", e);
param = this.decodeURI(param);
} catch (IllegalArgumentException e) {
// Malformed URI sequence.
throw new IllegalArgumentException(
Expand Down Expand Up @@ -2269,10 +2405,7 @@ public List<Patch> patch_fromText(String textline)
line = text.getFirst().substring(1);
line = line.replace("+", "%2B"); // decode would change all "+" to " "
try {
line = URLDecoder.decode(line, "UTF-8");
} catch (UnsupportedEncodingException e) {
// Not likely on modern system.
throw new Error("This system does not support UTF-8.", e);
line = this.decodeURI(line);
} catch (IllegalArgumentException e) {
// Malformed URI sequence.
throw new IllegalArgumentException(
Expand Down
36 changes: 36 additions & 0 deletions java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,42 @@ public static void testDiffDelta() {

assertEquals("diff_fromDelta: Unicode.", diffs, dmp.diff_fromDelta(text1, delta));

diffs = diffList(new Diff(EQUAL, "\ud83d\ude4b\ud83d"), new Diff(INSERT, "\ude4c\ud83d"), new Diff(EQUAL, "\ude4b"));
delta = dmp.diff_toDelta(diffs);
assertEquals("diff_toDelta: Surrogate Pairs.", "=2\t+%F0%9F%99%8C\t=2", delta);

assertEquals(
"diff_toDelta: insert surrogate pair between similar high surrogates",
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd70"), new Diff(EQUAL, "\ud83c\udd71"))),
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70\ud83c"), new Diff(INSERT, "\udd70\ud83c"), new Diff(EQUAL, "\udd71")))
);

assertEquals(
"diff_toDelta: swap surrogate pairs delete/insert",
dmp.diff_toDelta(diffList(new Diff(DELETE, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd71"))),
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(DELETE, "\udd70"), new Diff(INSERT, "\udd71")))
);

assertEquals(
"diff_toDelta: swap surrogate pairs insert/delete",
dmp.diff_toDelta(diffList(new Diff(INSERT, "\ud83c\udd70"), new Diff(DELETE, "\ud83c\udd71"))),
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(INSERT, "\udd70"), new Diff(DELETE, "\udd71")))
);

assertEquals(
"diff_toDelta: empty diff groups",
dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(DELETE, ""), new Diff(INSERT, "ghijk"))),
dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(INSERT, "ghijk")))
);

// Different versions of the library may have created deltas with
// half of a surrogate pair encoded as if it were valid UTF-8
assertEquals(
"diff_toDelta: surrogate half encoded as UTF8",
dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "-2\t+%F0%9F%85%B1")),
dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "=1\t-1\t+%ED%B5%B1"))
);

// Verify pool of unchanged characters.
diffs = diffList(new Diff(INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # "));
String text2 = dmp.diff_text2(diffs);
Expand Down
Loading