Update URL homograph rules.

This commit is contained in:
Greyson Parrelli 2019-03-18 17:37:19 -07:00
parent ab2e85f6c7
commit 5b534c8b1a
3 changed files with 67 additions and 109 deletions

View File

@ -1,99 +0,0 @@
package org.thoughtcrime.securesms.linkpreview;
import java.util.regex.Pattern;
/**
 * Character-class patterns used to detect which scripts appear in a URL.
 *
 * <p>Each pattern is a single character class, so {@code PATTERN.matcher(s).find()} reports
 * whether {@code s} contains at least one code point from the listed ranges.
 *
 * <p>NOTE(review): the ranges look like they were transcribed from the Unicode
 * {@code Scripts.txt} data file — confirm against the Unicode version you target before
 * extending them.
 */
public class LegalUrlPatterns {

  /**
   * Latin letters from Basic Latin up to U+01BA, plus the ordinal indicators U+00AA and U+00BA.
   *
   * <p>U+00D7 (MULTIPLICATION SIGN) and U+00F7 (DIVISION SIGN) are deliberately excluded: the
   * Latin-1 letter runs are U+00C0-U+00D6, U+00D8-U+00F6 and U+00F8 onwards.
   */
  public static final Pattern LATIN = Pattern.compile("[" +
      "\\x{0041}-\\x{005A}" +
      "\\x{0061}-\\x{007A}" +
      "\\x{00AA}" +
      "\\x{00BA}" +
      // Ends at U+00D6 so that U+00D7 (multiplication sign) is not treated as a Latin letter.
      "\\x{00C0}-\\x{00D6}" +
      "\\x{00D8}-\\x{00F6}" +
      "\\x{00F8}-\\x{01BA}" +
      "]");

  /** Code points in the Cyrillic ranges listed below (basic, supplement and extended blocks). */
  public static final Pattern CYRILLIC = Pattern.compile("[" +
      "\\x{0400}-\\x{0481}" +
      "\\x{0482}" +
      "\\x{0483}-\\x{0484}" +
      "\\x{0487}" +
      "\\x{0488}-\\x{0489}" +
      "\\x{048A}-\\x{052F}" +
      "\\x{1C80}-\\x{1C88}" +
      "\\x{1D2B}" +
      "\\x{1D78}" +
      "\\x{2DE0}-\\x{2DFF}" +
      "\\x{A640}-\\x{A66D}" +
      "\\x{A66E}" +
      "\\x{A66F}" +
      "\\x{A670}-\\x{A672}" +
      "\\x{A673}" +
      "\\x{A674}-\\x{A67D}" +
      "\\x{A67E}" +
      "\\x{A67F}" +
      "\\x{A680}-\\x{A69B}" +
      "\\x{A69C}-\\x{A69D}" +
      "\\x{A69E}-\\x{A69F}" +
      "\\x{FE2E}-\\x{FE2F}" +
      "]");

  /**
   * Code points in the Greek ranges listed below, including several supplementary-plane ranges
   * (U+10140 and above), which the {@code \x{h...h}} syntax addresses as whole code points.
   */
  public static final Pattern GREEK = Pattern.compile("[" +
      "\\x{0370}-\\x{0373}" +
      "\\x{0375}" +
      "\\x{0376}-\\x{0377}" +
      "\\x{037A}" +
      "\\x{037B}-\\x{037D}" +
      "\\x{037F}" +
      "\\x{0384}" +
      "\\x{0386}" +
      "\\x{0388}-\\x{038A}" +
      "\\x{038C}" +
      "\\x{038E}-\\x{03A1}" +
      "\\x{03A3}-\\x{03E1}" +
      "\\x{03F0}-\\x{03F5}" +
      "\\x{03F6}" +
      "\\x{03F7}-\\x{03FF}" +
      "\\x{1D26}-\\x{1D2A}" +
      "\\x{1D5D}-\\x{1D61}" +
      "\\x{1D66}-\\x{1D6A}" +
      "\\x{1DBF}" +
      "\\x{1F00}-\\x{1F15}" +
      "\\x{1F18}-\\x{1F1D}" +
      "\\x{1F20}-\\x{1F45}" +
      "\\x{1F48}-\\x{1F4D}" +
      "\\x{1F50}-\\x{1F57}" +
      "\\x{1F59}" +
      "\\x{1F5B}" +
      "\\x{1F5D}" +
      "\\x{1F5F}-\\x{1F7D}" +
      "\\x{1F80}-\\x{1FB4}" +
      "\\x{1FB6}-\\x{1FBC}" +
      "\\x{1FBD}" +
      "\\x{1FBE}" +
      "\\x{1FBF}-\\x{1FC1}" +
      "\\x{1FC2}-\\x{1FC4}" +
      "\\x{1FC6}-\\x{1FCC}" +
      "\\x{1FCD}-\\x{1FCF}" +
      "\\x{1FD0}-\\x{1FD3}" +
      "\\x{1FD6}-\\x{1FDB}" +
      "\\x{1FDD}-\\x{1FDF}" +
      "\\x{1FE0}-\\x{1FEC}" +
      "\\x{1FED}-\\x{1FEF}" +
      "\\x{1FF2}-\\x{1FF4}" +
      "\\x{1FF6}-\\x{1FFC}" +
      "\\x{1FFD}-\\x{1FFE}" +
      "\\x{2126}" +
      "\\x{AB65}" +
      "\\x{10140}-\\x{10174}" +
      "\\x{10175}-\\x{10178}" +
      "\\x{10179}-\\x{10189}" +
      "\\x{1018A}-\\x{1018B}" +
      "\\x{1018C}-\\x{1018E}" +
      "\\x{101A0}" +
      "\\x{1D200}-\\x{1D241}" +
      "\\x{1D242}-\\x{1D244}" +
      "\\x{1D245}" +
      "]");
}

View File

@ -10,11 +10,17 @@ import com.annimon.stream.Stream;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import okhttp3.HttpUrl;
public final class LinkPreviewUtil {
private static final Pattern DOMAIN_PATTERN = Pattern.compile("^(https?://)?([^/]+).*$");
private static final Pattern ALL_ASCII_PATTERN = Pattern.compile("^[\\x00-\\x7F]*$");
private static final Pattern ALL_NON_ASCII_PATTERN = Pattern.compile("^[^\\x00-\\x7F]*$");
/**
* @return All whitelisted URLs in the source text.
*/
@ -57,10 +63,16 @@ public final class LinkPreviewUtil {
}
public static boolean isLegalUrl(@NonNull String url) {
if (LegalUrlPatterns.LATIN.matcher(url).find()) {
return !LegalUrlPatterns.CYRILLIC.matcher(url).find() &&
!LegalUrlPatterns.GREEK.matcher(url).find();
Matcher matcher = DOMAIN_PATTERN.matcher(url);
if (matcher.matches()) {
String domain = matcher.group(2);
String cleanedDomain = domain.replaceAll("\\.", "");
return ALL_ASCII_PATTERN.matcher(cleanedDomain).matches() ||
ALL_NON_ASCII_PATTERN.matcher(cleanedDomain).matches();
} else {
return false;
}
return true;
}
}

View File

@ -8,17 +8,62 @@ import static junit.framework.TestCase.assertTrue;
public class LinkPreviewUtilTest {
@Test
public void isLegal_allLatin() {
assertTrue(LinkPreviewUtil.isLegalUrl("https://signal.org"));
public void isLegal_allAscii_noProtocol() {
assertTrue(LinkPreviewUtil.isLegalUrl("google.com"));
}
@Test
public void isLegal_latinAndCyrillic() {
assertFalse(LinkPreviewUtil.isLegalUrl("https://www.аmazon.com"));
public void isLegal_allAscii_noProtocol_subdomain() {
assertTrue(LinkPreviewUtil.isLegalUrl("foo.google.com"));
}
@Test
public void isLegal_latinAndGreek() {
assertFalse(LinkPreviewUtil.isLegalUrl("https://www.αpple.com"));
public void isLegal_allAscii_subdomain() {
assertTrue(LinkPreviewUtil.isLegalUrl("https://foo.google.com"));
}
/** Expects an all-ASCII https URL with a subdomain and a path to be reported as legal. */
@Test
public void isLegal_allAscii_subdomain_path() {
  final String url = "https://foo.google.com/some/path.html";
  assertTrue(LinkPreviewUtil.isLegalUrl(url));
}
/** Expects a URL whose host label is Cyrillic but whose TLD is ASCII to be reported as illegal. */
@Test
public void isLegal_cyrillicHostAsciiTld() {
  final String url = "http://кц.com";
  assertFalse(LinkPreviewUtil.isLegalUrl(url));
}
/** Same expectation as the Cyrillic-host/ASCII-TLD case, but with no scheme prefix on the URL. */
@Test
public void isLegal_cyrillicHostAsciiTld_noProtocol() {
  final String url = "кц.com";
  assertFalse(LinkPreviewUtil.isLegalUrl(url));
}
/**
 * Expects a host label that mixes ASCII letters with a non-ASCII character to be reported as
 * illegal. Note: despite the {@code _noProtocol} suffix, the URL under test includes "http://".
 */
@Test
public void isLegal_mixedHost_noProtocol() {
  final String url = "http://asĸ.com";
  assertFalse(LinkPreviewUtil.isLegalUrl(url));
}
/** Expects a scheme-less URL whose host and TLD are both Cyrillic to be reported as legal. */
@Test
public void isLegal_cyrillicHostAndTld_noProtocol() {
  final String url = "кц.рф";
  assertTrue(LinkPreviewUtil.isLegalUrl(url));
}
/** Expects an ASCII path after an all-Cyrillic, scheme-less domain to still be reported as legal. */
@Test
public void isLegal_cyrillicHostAndTld_asciiPath_noProtocol() {
  final String url = "кц.рф/some/path";
  assertTrue(LinkPreviewUtil.isLegalUrl(url));
}
/** Expects an https URL with an all-Cyrillic domain and an ASCII path to be reported as legal. */
@Test
public void isLegal_cyrillicHostAndTld_asciiPath() {
  final String url = "https://кц.рф/some/path";
  assertTrue(LinkPreviewUtil.isLegalUrl(url));
}
/** Expects an ASCII subdomain in front of a Cyrillic host and TLD to be reported as illegal. */
@Test
public void isLegal_asciiSubdomain_cyrillicHostAndTld() {
  final String url = "http://foo.кц.рф";
  assertFalse(LinkPreviewUtil.isLegalUrl(url));
}
/** Expects the empty string to be reported as illegal. */
@Test
public void isLegal_emptyUrl() {
  final String url = "";
  assertFalse(LinkPreviewUtil.isLegalUrl(url));
}
}