2020-07-31 22:49:00 -04:00
|
|
|
|
using System.Collections.Generic;
|
2020-08-03 02:10:20 -04:00
|
|
|
|
using System.Linq;
|
2020-07-31 22:49:00 -04:00
|
|
|
|
using System.Text.RegularExpressions;
|
|
|
|
|
using BirdsiteLive.ActivityPub.Models;
|
|
|
|
|
using BirdsiteLive.Common.Settings;
|
|
|
|
|
|
|
|
|
|
namespace BirdsiteLive.Domain.Tools
|
2020-07-31 22:13:52 -04:00
|
|
|
|
{
|
2020-07-31 22:49:00 -04:00
|
|
|
|
/// <summary>
/// Turns raw status text into a marked-up content string plus the list of
/// tags (hashtags and mentions) found in it.
/// </summary>
public interface IStatusExtractor
{
    /// <summary>
    /// Processes <paramref name="messageContent"/> and extracts its tags.
    /// </summary>
    /// <param name="messageContent">The raw text of the status.</param>
    /// <returns>
    /// A tuple holding the processed content and the tags that were extracted from it.
    /// </returns>
    (string content, Tag[] tags) ExtractTags(string messageContent);
}
|
|
|
|
|
|
|
|
|
|
public class StatusExtractor : IStatusExtractor
|
|
|
|
|
{
|
2020-07-31 23:16:23 -04:00
|
|
|
|
// Hashtag: one non-word character, then (captured in group 1) '#' followed by
// letters/digits/'_'/'ー' ending on a word boundary, and not followed by ';'.
private readonly Regex _hastagRegex = new Regex(@"\W(\#[a-zA-Z0-9_ー]+\b)(?!;)");
// Previous pattern attempts, kept for reference:
//private readonly Regex _hastagRegex = new Regex(@"#\w+");
//private readonly Regex _hastagRegex = new Regex(@"(?<=[\s>]|^)#(\w*[a-zA-Z0-9_ー]+\w*)\b(?!;)");
//private readonly Regex _hastagRegex = new Regex(@"(?<=[\s>]|^)#(\w*[a-zA-Z0-9_ー]+)\b(?!;)");

// Mention: same shape as the hashtag pattern, with '@' instead of '#'.
private readonly Regex _mentionRegex = new Regex(@"\W(\@[a-zA-Z0-9_ー]+\b)(?!;)");
// Previous pattern attempts, kept for reference:
//private readonly Regex _mentionRegex = new Regex(@"@\w+");
//private readonly Regex _mentionRegex = new Regex(@"(?<=[\s>]|^)@(\w*[a-zA-Z0-9_ー]+\w*)\b(?!;)");
//private readonly Regex _mentionRegex = new Regex(@"(?<=[\s>]|^)@(\w*[a-zA-Z0-9_ー]+)\b(?!;)");

// http / ftp / https URLs: scheme, dotted host, optional path/query/fragment.
private readonly Regex _urlRegex = new Regex(@"((http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?)");

// Emoji sequences, built from the EmojiPattern constant declared in this class.
private readonly Regex _emojiRegex = new Regex(EmojiPattern);

// Instance configuration; its Domain value is used when building tag and mention links.
private readonly InstanceSettings _instanceSettings;
|
|
|
|
|
|
|
|
|
|
#region Ctor
/// <summary>
/// Creates an extractor bound to the given instance configuration.
/// </summary>
/// <param name="instanceSettings">Settings stored for later use when building links.</param>
public StatusExtractor(InstanceSettings instanceSettings) => _instanceSettings = instanceSettings;
#endregion
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Converts raw status text into HTML-flavoured content and collects the
/// hashtags and mentions it contains.
/// </summary>
/// <remarks>
/// Steps, in order: line breaks become <c>&lt;/p&gt;&lt;p&gt;</c> / <c>&lt;br/&gt;</c>,
/// emojis are padded with spaces, URLs are wrapped in anchors, hashtags and
/// mentions are wrapped in anchors and recorded as tags, then helper whitespace
/// is cleaned up.
/// <para>
/// Every rewrite is a single-pass <see cref="Regex.Replace(string, MatchEvaluator)"/>:
/// the matched text is never re-used as a regex pattern (it may contain
/// metacharacters such as <c>?</c>, <c>+</c> or <c>(</c>) and text that has
/// already been rewritten is not scanned again by the same step.
/// </para>
/// <para>
/// Known limitation (unchanged): hashtags and mentions are searched after URLs
/// have been turned into anchors, so a URL containing e.g. <c>/#name</c> or
/// <c>/@name</c> can still be picked up inside the generated <c>href</c>.
/// </para>
/// </remarks>
/// <param name="messageContent">The raw text of the status.</param>
/// <returns>
/// The processed, trimmed content and the extracted tags: hashtags first, then
/// mentions, each group ordered by descending match length and without duplicates.
/// </returns>
public (string content, Tag[] tags) ExtractTags(string messageContent)
{
    var tags = new List<Tag>();

    // Pad both ends so the \W prefix required by the hashtag/mention patterns
    // also matches at the very start of the text.
    messageContent = $" {messageContent} ";

    // Replace return lines
    messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "</p><p> ");
    messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "<br/> ");

    // Secure emojis: surround each one with spaces so it cannot stick to a
    // neighbouring URL, hashtag or mention.
    messageContent = _emojiRegex.Replace(messageContent, m => $" {m.Value} ");

    // Extract Urls
    messageContent = _urlRegex.Replace(messageContent, m => BuildUrlAnchor(m.Value));

    // Extract Hashtags
    var seenHashtags = new HashSet<string>();
    foreach (var m in OrderByLength(_hastagRegex.Matches(messageContent)))
    {
        // Group 1 is "#name"; the full match also carries the leading \W character.
        var tag = m.Groups[1].Value.Substring(1);
        if (!seenHashtags.Add(tag))
        {
            continue;
        }

        tags.Add(new Tag
        {
            name = $"#{tag}",
            href = $"https://{_instanceSettings.Domain}/tags/{tag}",
            type = "Hashtag"
        });
    }

    messageContent = _hastagRegex.Replace(messageContent, m =>
    {
        var tag = m.Groups[1].Value.Substring(1);
        var url = $"https://{_instanceSettings.Domain}/tags/{tag}";
        return $@"{LeadingSeparator(m)}<a href=""{url}"" class=""mention hashtag"" rel=""tag"">#<span>{tag}</span></a>";
    });

    // Extract Mentions
    var seenMentions = new HashSet<string>();
    foreach (var m in OrderByLength(_mentionRegex.Matches(messageContent)))
    {
        // Group 1 is "@name"; the full match also carries the leading \W character.
        var mention = m.Groups[1].Value.Substring(1);
        if (!seenMentions.Add(mention))
        {
            continue;
        }

        tags.Add(new Tag
        {
            name = $"@{mention}@{_instanceSettings.Domain}",
            href = $"https://{_instanceSettings.Domain}/users/{mention}",
            type = "Mention"
        });
    }

    messageContent = _mentionRegex.Replace(messageContent, m =>
    {
        var mention = m.Groups[1].Value.Substring(1);
        return $@"{LeadingSeparator(m)}<span class=""h-card""><a href=""https://{_instanceSettings.Domain}/@{mention}"" class=""u-url mention"">@<span>{mention}</span></a></span>";
    });

    // Clean up return lines
    messageContent = Regex.Replace(messageContent, @"<p> ", "<p>");
    messageContent = Regex.Replace(messageContent, @"<br/> ", "<br/>");
    // NOTE(review): as written, the next two replacements swap a single space for
    // a single space and therefore change nothing. They were presumably meant to
    // collapse runs of spaces (whitespace may have been lost in this copy of the
    // file) - confirm against the repository before changing the patterns.
    messageContent = Regex.Replace(messageContent, @" ", " ");
    messageContent = Regex.Replace(messageContent, @" ", " ");

    return (messageContent.Trim(), tags.ToArray());
}

// Builds the anchor markup for one URL: the scheme (and a leading "www.") goes in an
// invisible span, the next 30 characters are shown, and the remainder is hidden.
private static string BuildUrlAnchor(string url)
{
    var protocol = "https://";
    if (url.StartsWith("http://", System.StringComparison.Ordinal))
    {
        protocol = "http://";
    }
    else if (url.StartsWith("ftp://", System.StringComparison.Ordinal))
    {
        protocol = "ftp://";
    }

    // Strip only the leading scheme / "www.", never later occurrences inside the URL.
    var truncatedUrl = url.StartsWith(protocol, System.StringComparison.Ordinal)
        ? url.Substring(protocol.Length)
        : url;

    if (truncatedUrl.StartsWith("www.", System.StringComparison.Ordinal))
    {
        protocol += "www.";
        truncatedUrl = truncatedUrl.Substring("www.".Length);
    }

    var firstPart = truncatedUrl;
    var secondPart = string.Empty;

    if (truncatedUrl.Length > 30)
    {
        firstPart = truncatedUrl.Substring(0, 30);
        secondPart = truncatedUrl.Substring(30);
    }

    return $@" <a href=""{url}"" rel=""nofollow noopener noreferrer"" target=""_blank""><span class=""invisible"">{protocol}</span><span class=""ellipsis"">{firstPart}</span><span class=""invisible"">{secondPart}</span></a>";
}

// Returns the text to emit in place of the non-word character that precedes a
// hashtag/mention capture: whitespace becomes a single space, anything else
// (punctuation, brackets, ...) is kept as it was.
private static string LeadingSeparator(Match m)
{
    var lead = m.Value.Substring(0, m.Groups[1].Index - m.Index);
    return string.IsNullOrWhiteSpace(lead) ? " " : lead;
}
|
2020-08-03 02:10:20 -04:00
|
|
|
|
|
|
|
|
|
/// <summary>
/// Copies the matches into a list sorted from the longest match to the shortest.
/// </summary>
/// <param name="matches">The matches to sort.</param>
/// <returns>A new list of the same matches, ordered by descending <see cref="Capture.Length"/>.</returns>
private IEnumerable<Match> OrderByLength(MatchCollection matches)
{
    return matches
        .Cast<Match>()
        .OrderByDescending(match => match.Length)
        .ToList();
}
|
2021-01-11 19:50:08 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private const string EmojiPattern = @"(?:\uD83D(?:\uDD73\uFE0F?|\uDC41(?:(?:\uFE0F(?:\u200D\uD83D\uDDE8\uFE0F?)?|\u200D\uD83D\uDDE8\uFE0F?))?|[\uDDE8\uDDEF]\uFE0F?|\uDC4B(?:\uD83C[\uDFFB-\uDFFF])?|\uDD90(?:(?:\uD83C[\uDFFB-\uDFFF]|\uFE0F))?|[\uDD96\uDC4C\uDC48\uDC49\uDC46\uDD95\uDC47\uDC4D\uDC4E\uDC4A\uDC4F\uDE4C\uDC50\uDE4F\uDC85\uDCAA\uDC42\uDC43\uDC76\uDC66\uDC67](?:\uD83C[\uDFFB-\uDFFF])?|\uDC71(?:(?:\uD83C(?:[\uDFFB-\uDFFF](?:\u200D(?:[\u2640\u2642]\uFE0F?))?)|\u200D(?:[\u2640\u2642]\uFE0F?)))?|\uDC68(?:(?:\uD83C(?:\uDFFB(?:\u200D(?:\uD83E(?:\uDD1D\u200D\uD83D\uDC68\uD83C[\uDFFC-\uDFFF]|[\uDDB0\uDDB1\uDDB3\uDDB2\uDDAF\uDDBC\uDDBD])|\u2695\uFE0F?|\uD83C[\uDF93\uDFEB\uDF3E\uDF73\uDFED\uDFA4\uDFA8]|\u2696\uFE0F?|\uD83D[\uDD27\uDCBC\uDD2C\uDCBB\uDE80\uDE92]|\u2708\uFE0F?))?|\uDFFC(?:\u200D(?:\uD83E(?:\uDD1D\u200D\uD83D\uDC68\uD83C[\uDFFB\uDFFD-\uDFFF]|[\uDDB0\uDDB1\uDDB3\uDDB2\uDDAF\uDDBC\uDDBD])|\u2695\uFE0F?|\uD83C[\uDF93\uDFEB\uDF3E\uDF73\uDFED\uDFA4\uDFA8]|\u2696\uFE0F?|\uD83D[\uDD27\uDCBC\uDD2C\uDCBB\uDE80\uDE92]|\u2708\uFE0F?))?|\uDFFD(?:\u200D(?:\uD83E(?:\uDD1D\u200D\uD83D\uDC68\uD83C[\uDFFB\uDFFC\uDFFE\uDFFF]|[\uDDB0\uDDB1\uDDB3\uDDB2\uDDAF\uDDBC\uDDBD])|\u2695\uFE0F?|\uD83C[\uDF93\uDFEB\uDF3E\uDF73\uDFED\uDFA4\uDFA8]|\u2696\uFE0F?|\uD83D[\uDD27\uDCBC\uDD2C\uDCBB\uDE80\uDE92]|\u2708\uFE0F?))?|\uDFFE(?:\u200D(?:\uD83E(?:\uDD1D\u200D\uD83D\uDC68\uD83C[\uDFFB-\uDFFD\uDFFF]|[\uDDB0\uDDB1\uDDB3\uDDB2\uDDAF\uDDBC\uDDBD])|\u2695\uFE0F?|\uD83C[\uDF93\uDFEB\uDF3E\uDF73\uDFED\uDFA4\uDFA8]|\u2696\uFE0F?|\uD83D[\uDD27\uDCBC\uDD2C\uDCBB\uDE80\uDE92]|\u2708\uFE0F?))?|\uDFFF(?:\u200D(?:\uD83E(?:\uDD1D\u200D\uD83D\uDC68\uD83C[\uDFFB-\uDFFE]|[\uDDB0\uDDB1\uDDB3\uDDB2\uDDAF\uDDBC\uDDBD])|\u2695\uFE0F?|\uD83C[\uDF93\uDFEB\uDF3E\uDF73\uDFED\uDFA4\uDFA8]|\u2696\uFE0F?|\uD83D[\uDD27\uDCBC\uDD2C\uDCBB\uDE80\uDE92]|\u2708\uFE0F?))?)|\u200D(?:\uD83E[\uDDB0\uDDB1\uDDB3\uDDB2\uDDAF\uDDBC\uDDBD]|\u2695\uFE0F?|\uD83C[\uDF93\uDFEB\uDF3E\uDF73\uDFED\uDFA4\uDFA8]|\u2696\uFE0F?|\uD83D(?:\u
DC69\u200D\uD83D(?:\uDC66(?:\u200D\uD83D\uDC66)?|\uDC67(?:\u200D\uD83D[\uDC66\uDC67])?)|\uDC68\u200D\uD83D(?:\uDC66(?:\u200D\uD83D\uDC66)?|\uDC67(?:\u200D\uD83D[\uDC66\uDC67])?)|\uDC66(?:\u200D\uD83D\uDC66)?|\uDC67(?:\u200D\uD83D[\uDC66\uDC67])?|[\uDD27\uDCBC\uDD2C\uDCBB\uDE80\uDE92])|\u2708\uFE0F?|\u2764(?:\uFE0F\u200D\uD83D(?:\uDC8B\u200D\uD83D\uDC68|\uDC68)|\u200D\uD83D(?:\uDC8B\u200D\uD83D\uDC68|\uDC68)))))?|\uDC69(?:(?:\uD83C(?:\uDFFB(?:\u200D(?:\uD83E(?:\uDD1D\u200D\uD83D(?:\uDC69\uD83C[\uDFFC-\uDFFF]|\uDC68\uD83C[\uDFFC-\uDFFF])|[\uDDB0\uDDB1\uDDB3\uDDB2\uDDAF\uDDBC\uDDBD])|\u2695\uFE0F?|\uD83C[\uDF93\uDFEB\uDF3E\uDF73\uDFED\uDFA4\uDFA8]|\u2696\uFE0F?|\uD83D[\uDD27\uDCBC\uDD2C\uDCBB\uDE80\uDE92]|\u2708\uFE0F?))?|\uDFFC(?:\u200D(?:\uD83E(?:\uDD1D\u200D\uD83D(?:\uDC69\uD83C[\uDFFB\uDFFD-\uDFFF]|\uDC68\uD83C[\uDFFB\uDFFD-\uDFFF])|[\uDDB0\uDDB1\uDDB3\uDDB2\uDDAF\uDDBC\uDDBD])|\u2695\uFE0F?|\uD83C[\uDF93\uDFEB\uDF3E\uDF73\uDFED\uDFA4\uDFA8]|\u2696\uFE0F?|\uD83D[\uDD27\uDCBC\uDD2C\uDCBB\uDE80\uDE92]|\u2708\uFE0F?))?|\uDFFD(?:\u200D(?:\uD83E(?:\uDD1D\u200D\uD83D(?:\uDC69\uD83C[\uDFFB\uDFFC\uDFFE\uDFFF]|\uDC68\uD83C[\uDFFB\uDFFC\uDFFE\uDFFF])|[\uDDB0\uDDB1\uDDB3\uDDB2\uDDAF\uDDBC\uDDBD])|\u2695\uFE0F?|\uD83C[\uDF93\uDFEB\uDF3E\uDF73\uDFED\uDFA4\uDFA8]|\u2696\uFE0F?|\uD83D[\uDD27\uDCBC\uDD2C\uDCBB\uDE80\uDE92]|\u2708\uFE0F?))?|\uDFFE(?:\u200D(?:\uD83E(?:\uDD1D\u200D\uD83D(?:\uDC69\uD83C[\uDFFB-\uDFFD\uDFFF]|\uDC68\uD83C[\uDFFB-\uDFFD\uDFFF])|[\uDDB0\uDDB1\uDDB3\uDDB2\uDDAF\uDDBC\uDDBD])|\u2695\uFE0F?|\uD83C[\uDF93\uDFEB\uDF3E\uDF73\uDFED\uDFA4\uDFA8]|\u2696\uFE0F?|\uD83D[\uDD27\uDCBC\uDD2C\uDCBB\uDE80\uDE92]|\u2708\uFE0F?))?|\uDFFF(?:\u200D(?:\uD83E(?:\uDD1D\u200D\uD83D(?:\uDC69\uD83C[\uDFFB-\uDFFE]|\uDC68\uD83C[\uDFFB-\uDFFE])|[\uDDB0\uDDB1\uDDB3\uDDB2\uDDAF\uDDBC\uDDBD])|\u2695\uFE0F?|\uD83C[\uDF93\uDFEB\uDF3E\uDF73\uDFED\uDFA4\uDFA8]|\u2696\uFE0F?|\uD83D[\uDD27\uDCBC\uDD2C\uDCBB\uDE80\uDE92]|\u2708\uFE0F?))?)|\u200D(?:\uD83E[\uDDB0\uDDB1\uDDB3\uDDB2\uDDAF\uDDBC\uDD
BD]|\u2695\uFE0F?|\uD83C[\uDF93\uDFEB\uDF3E\uDF73\uDFED\uDFA4\uDFA8]|\u2696\uFE0F?|\uD8
|
2020-07-31 22:13:52 -04:00
|
|
|
|
}
|
|
|
|
|
}
|