From 3518c542770fb5f683f2dcb0f8844ba47c98cf53 Mon Sep 17 00:00:00 2001 From: Vincent Cloutier Date: Fri, 30 Jun 2023 15:01:40 -0400 Subject: [PATCH] fix url parsing --- .../Regexes/HashtagRegexes.cs | 2 +- .../Regexes/UserRegexes.cs | 2 +- .../Tools/StatusExtractor.cs | 7 +++-- .../Tools/StatusExtractorTests.cs | 30 ++++++++++++++++--- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/BirdsiteLive.Common/Regexes/HashtagRegexes.cs b/src/BirdsiteLive.Common/Regexes/HashtagRegexes.cs index b6d9d00..30df21a 100644 --- a/src/BirdsiteLive.Common/Regexes/HashtagRegexes.cs +++ b/src/BirdsiteLive.Common/Regexes/HashtagRegexes.cs @@ -5,6 +5,6 @@ namespace BirdsiteLive.Common.Regexes public class HashtagRegexes { public static readonly Regex HashtagName = new Regex(@"^[a-zA-Z0-9_]+$"); - public static readonly Regex Hashtag = new Regex(@"(.?)#([a-zA-Z0-9_]+)(\s|$|[\[\]<>.,;:!?/|-])"); + public static readonly Regex Hashtag = new Regex(@"(^|.?[ \n]+)#([a-zA-Z0-9_]+)(?=\s|$|[\[\]<>.,;:!?/|-])"); } } \ No newline at end of file diff --git a/src/BirdsiteLive.Common/Regexes/UserRegexes.cs b/src/BirdsiteLive.Common/Regexes/UserRegexes.cs index 9a0569e..01cd579 100644 --- a/src/BirdsiteLive.Common/Regexes/UserRegexes.cs +++ b/src/BirdsiteLive.Common/Regexes/UserRegexes.cs @@ -5,6 +5,6 @@ namespace BirdsiteLive.Common.Regexes public class UserRegexes { public static readonly Regex TwitterAccount = new Regex(@"^[a-zA-Z0-9_]+$"); - public static readonly Regex Mention = new Regex(@"(.?)@([a-zA-Z0-9_]+)(\s|$|[\[\]<>,;:'\.’!?/—\|-]|(. ))"); + public static readonly Regex Mention = new Regex(@"(^|.?[ \n\.]+)@([a-zA-Z0-9_]+)(?=\s|$|[\[\]<>,;:'\.’!?/—\|-]|(. ))"); } } \ No newline at end of file diff --git a/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs b/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs index 0761265..bebb333 100644 --- a/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs +++ b/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs @@ -32,9 +32,6 @@ namespace BirdsiteLive.Domain.Tools { var tags = new List(); - // Replace return lines - messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "

"); - messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "
"); // Extract Urls @@ -124,6 +121,10 @@ namespace BirdsiteLive.Domain.Tools $@"{m.Groups[1]}@{mention.ToLower()}{m.Groups[3]}"); } } + + // Replace return lines + messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "

"); + messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "
"); return (messageContent.Trim(), tags.ToArray()); } diff --git a/src/Tests/BirdsiteLive.Domain.Tests/Tools/StatusExtractorTests.cs b/src/Tests/BirdsiteLive.Domain.Tests/Tools/StatusExtractorTests.cs index 0460727..7255afd 100644 --- a/src/Tests/BirdsiteLive.Domain.Tests/Tools/StatusExtractorTests.cs +++ b/src/Tests/BirdsiteLive.Domain.Tests/Tools/StatusExtractorTests.cs @@ -111,7 +111,6 @@ namespace BirdsiteLive.Domain.Tests.Tools Assert.IsTrue(result.content.Contains(@"https://www.eff.org/deeplinks/2020/07/pact-act-not-solution-problem-harmful-online-content")); #endregion } - [Ignore] [TestMethod] public void Extract_FormatUrl_Long2_Test() { @@ -128,7 +127,29 @@ namespace BirdsiteLive.Domain.Tests.Tools #region Validations logger.VerifyAll(); - Assert.AreEqual(result.content, @"https://www.eff.org/deeplinks/2020/07/pact-act-not-solution-problem-harmful-online-content"); + Assert.AreEqual(result.content, @"https://twitterisgoinggreat.com/#twitters-first-dollar15bn-interest-payment-could-be-due-in-two-weeks"); + Assert.AreEqual(0, result.tags.Length); + + #endregion + } + + [TestMethod] + public void Extract_FormatUrl_Long3_Test() + { + #region Stubs + var message = $"https://domain.name/@WeekInEthNews/1668684659855880193"; + #endregion + + #region Mocks + var logger = new Mock>(); + #endregion + + var service = new StatusExtractor(_settings, logger.Object); + var result = service.Extract(message); + + #region Validations + logger.VerifyAll(); + Assert.AreEqual(result.content, @"https://domain.name/@WeekInEthNews/1668684659855880193"); Assert.AreEqual(0, result.tags.Length); #endregion @@ -633,7 +654,7 @@ namespace BirdsiteLive.Domain.Tests.Tools public void Extract_Emoji_Test() { #region Stubs - var message = $"😤@mynickname 😎😍🤗🤩😘"; + var message = $"😤 @mynickname 😎😍🤗🤩😘"; //var message = $"tests@mynickname"; #endregion @@ -648,12 +669,13 @@ namespace BirdsiteLive.Domain.Tests.Tools logger.VerifyAll(); Assert.AreEqual(1, result.tags.Length); Assert.IsTrue(result.content.Contains( - @"😤@mynickname")); + @"😤 @mynickname")); Assert.IsTrue(result.content.Contains(@"😎😍🤗🤩😘")); #endregion } + [Ignore] [TestMethod] public void Extract_Parenthesis_Test() {