From 3518c542770fb5f683f2dcb0f8844ba47c98cf53 Mon Sep 17 00:00:00 2001
From: Vincent Cloutier
Date: Fri, 30 Jun 2023 15:01:40 -0400
Subject: [PATCH] fix url parsing
---
.../Regexes/HashtagRegexes.cs | 2 +-
.../Regexes/UserRegexes.cs | 2 +-
.../Tools/StatusExtractor.cs | 7 +++--
.../Tools/StatusExtractorTests.cs | 30 ++++++++++++++++---
4 files changed, 32 insertions(+), 9 deletions(-)
diff --git a/src/BirdsiteLive.Common/Regexes/HashtagRegexes.cs b/src/BirdsiteLive.Common/Regexes/HashtagRegexes.cs
index b6d9d00..30df21a 100644
--- a/src/BirdsiteLive.Common/Regexes/HashtagRegexes.cs
+++ b/src/BirdsiteLive.Common/Regexes/HashtagRegexes.cs
@@ -5,6 +5,6 @@ namespace BirdsiteLive.Common.Regexes
public class HashtagRegexes
{
public static readonly Regex HashtagName = new Regex(@"^[a-zA-Z0-9_]+$");
- public static readonly Regex Hashtag = new Regex(@"(.?)#([a-zA-Z0-9_]+)(\s|$|[\[\]<>.,;:!?/|-])");
+ public static readonly Regex Hashtag = new Regex(@"(^|.?[ \n]+)#([a-zA-Z0-9_]+)(?=\s|$|[\[\]<>.,;:!?/|-])");
}
}
\ No newline at end of file
diff --git a/src/BirdsiteLive.Common/Regexes/UserRegexes.cs b/src/BirdsiteLive.Common/Regexes/UserRegexes.cs
index 9a0569e..01cd579 100644
--- a/src/BirdsiteLive.Common/Regexes/UserRegexes.cs
+++ b/src/BirdsiteLive.Common/Regexes/UserRegexes.cs
@@ -5,6 +5,6 @@ namespace BirdsiteLive.Common.Regexes
public class UserRegexes
{
public static readonly Regex TwitterAccount = new Regex(@"^[a-zA-Z0-9_]+$");
- public static readonly Regex Mention = new Regex(@"(.?)@([a-zA-Z0-9_]+)(\s|$|[\[\]<>,;:'\.’!?/—\|-]|(. ))");
+ public static readonly Regex Mention = new Regex(@"(^|.?[ \n\.]+)@([a-zA-Z0-9_]+)(?=\s|$|[\[\]<>,;:'\.’!?/—\|-]|(. ))");
}
}
\ No newline at end of file
diff --git a/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs b/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs
index 0761265..bebb333 100644
--- a/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs
+++ b/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs
@@ -32,9 +32,6 @@ namespace BirdsiteLive.Domain.Tools
{
var tags = new List();
- // Replace return lines
- messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "
");
- messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "
");
// Extract Urls
@@ -124,6 +121,10 @@ namespace BirdsiteLive.Domain.Tools
$@"{m.Groups[1]}@{mention.ToLower()}{m.Groups[3]}");
}
}
+
+ // Replace return lines
+ messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "
");
+ messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "
");
return (messageContent.Trim(), tags.ToArray());
}
diff --git a/src/Tests/BirdsiteLive.Domain.Tests/Tools/StatusExtractorTests.cs b/src/Tests/BirdsiteLive.Domain.Tests/Tools/StatusExtractorTests.cs
index 0460727..7255afd 100644
--- a/src/Tests/BirdsiteLive.Domain.Tests/Tools/StatusExtractorTests.cs
+++ b/src/Tests/BirdsiteLive.Domain.Tests/Tools/StatusExtractorTests.cs
@@ -111,7 +111,6 @@ namespace BirdsiteLive.Domain.Tests.Tools
Assert.IsTrue(result.content.Contains(@"https://www.eff.org/deeplinks/2020/07/pact-act-not-solution-problem-harmful-online-content"));
#endregion
}
- [Ignore]
[TestMethod]
public void Extract_FormatUrl_Long2_Test()
{
@@ -128,7 +127,29 @@ namespace BirdsiteLive.Domain.Tests.Tools
#region Validations
logger.VerifyAll();
- Assert.AreEqual(result.content, @"https://www.eff.org/deeplinks/2020/07/pact-act-not-solution-problem-harmful-online-content");
+ Assert.AreEqual(result.content, @"https://twitterisgoinggreat.com/#twitters-first-dollar15bn-interest-payment-could-be-due-in-two-weeks");
+ Assert.AreEqual(0, result.tags.Length);
+
+ #endregion
+ }
+
+ [TestMethod]
+ public void Extract_FormatUrl_Long3_Test()
+ {
+ #region Stubs
+ var message = $"https://domain.name/@WeekInEthNews/1668684659855880193";
+ #endregion
+
+ #region Mocks
+ var logger = new Mock>();
+ #endregion
+
+ var service = new StatusExtractor(_settings, logger.Object);
+ var result = service.Extract(message);
+
+ #region Validations
+ logger.VerifyAll();
+ Assert.AreEqual(result.content, @"https://domain.name/@WeekInEthNews/1668684659855880193");
Assert.AreEqual(0, result.tags.Length);
#endregion
@@ -633,7 +654,7 @@ namespace BirdsiteLive.Domain.Tests.Tools
public void Extract_Emoji_Test()
{
#region Stubs
- var message = $"😤@mynickname 😎😍🤗🤩😘";
+ var message = $"😤 @mynickname 😎😍🤗🤩😘";
//var message = $"tests@mynickname";
#endregion
@@ -648,12 +669,13 @@ namespace BirdsiteLive.Domain.Tests.Tools
logger.VerifyAll();
Assert.AreEqual(1, result.tags.Length);
Assert.IsTrue(result.content.Contains(
- @"😤@mynickname"));
+ @"😤 @mynickname"));
Assert.IsTrue(result.content.Contains(@"😎😍🤗🤩😘"));
#endregion
}
+ [Ignore]
[TestMethod]
public void Extract_Parenthesis_Test()
{