cloutier--bird.makeup/src/BirdsiteLive.Domain/Tools/StatusExtractor.cs

169 lines
7.4 KiB
C#
Raw Normal View History

2020-07-31 22:49:00 -04:00
using System.Collections.Generic;
2020-08-03 02:10:20 -04:00
using System.Linq;
2020-07-31 22:49:00 -04:00
using System.Text.RegularExpressions;
using BirdsiteLive.ActivityPub.Models;
2021-02-01 20:23:54 -05:00
using BirdsiteLive.Common.Regexes;
2020-07-31 22:49:00 -04:00
using BirdsiteLive.Common.Settings;
2021-02-01 21:48:47 -05:00
using BirdsiteLive.Twitter;
using Microsoft.Extensions.Logging;
2020-07-31 22:49:00 -04:00
namespace BirdsiteLive.Domain.Tools
2020-07-31 22:13:52 -04:00
{
2020-07-31 22:49:00 -04:00
/// <summary>
/// Turns the plain text of a status into HTML content plus the list of
/// tags (hashtags and, optionally, mentions) found in that text.
/// </summary>
public interface IStatusExtractor
{
    /// <summary>
    /// Builds the HTML content and tag list for a status.
    /// </summary>
    /// <param name="messageContent">Raw text of the status.</param>
    /// <param name="extractMentions">
    /// When <c>true</c> (the default) mentions are processed as well;
    /// when <c>false</c> they are left untouched in the content.
    /// </param>
    /// <returns>
    /// A tuple holding the resulting content and the tags collected while producing it.
    /// </returns>
    (string content, Tag[] tags) Extract(string messageContent, bool extractMentions = true);
}
/// <summary>
/// Default <see cref="IStatusExtractor"/> implementation. It rewrites line breaks,
/// URLs, hashtags and (optionally) mentions found in a status into HTML markup, and
/// collects a <see cref="Tag"/> for every hashtag and mention it rewrites.
/// </summary>
public class StatusExtractor : IStatusExtractor
{
    // Number of characters of a link (after the scheme and a leading "www.") kept in the
    // "ellipsis" span; the rest goes into a trailing "invisible" span.
    private const int VisibleUrlLength = 30;

    private const string WwwPrefix = "www.";

    // Group 1 is the whole URL, group 2 is the scheme (http, ftp or https).
    // Static: the pattern never changes, so there is no reason to rebuild it per instance.
    private static readonly Regex UrlRegex = new Regex(@"((http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?)");

    private readonly InstanceSettings _instanceSettings;
    private readonly ILogger<StatusExtractor> _logger;

    #region Ctor
    /// <summary>
    /// Creates the extractor.
    /// </summary>
    /// <param name="instanceSettings">Settings whose <c>Domain</c> is used to build tag and user links.</param>
    /// <param name="logger">Logger that receives an error entry for every hashtag or mention that fails validation.</param>
    public StatusExtractor(InstanceSettings instanceSettings, ILogger<StatusExtractor> logger)
    {
        _instanceSettings = instanceSettings;
        _logger = logger;
    }
    #endregion

    /// <summary>
    /// Converts <paramref name="messageContent"/> to HTML and collects its tags.
    /// </summary>
    /// <remarks>
    /// Steps, in order:
    /// 1. doubled line breaks become <c>&lt;/p&gt;&lt;p&gt;</c>, remaining ones <c>&lt;br/&gt;</c>;
    /// 2. every URL becomes an anchor (preceded by one space) whose text is split into
    ///    invisible / ellipsis / invisible spans;
    /// 3. every hashtag becomes a link to <c>/tags/{tag}</c> on the instance domain;
    /// 4. if requested, every mention becomes an h-card link to <c>/@{mention}</c>.
    /// </remarks>
    /// <param name="messageContent">Raw text of the status.</param>
    /// <param name="extractMentions">Set to <c>false</c> to leave mentions untouched.</param>
    /// <returns>The trimmed HTML content and the tags that were collected, in processing order.</returns>
    public (string content, Tag[] tags) Extract(string messageContent, bool extractMentions = true)
    {
        var tags = new List<Tag>();

        // Replace return lines
        messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "</p><p>");
        messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "<br/>");

        // Extract Urls
        // A single pass with a match evaluator rewrites each occurrence exactly once.
        // Using the URL text itself as a pattern would turn '?', '+', '.' into regex
        // operators and would also re-match URLs inside anchors that were already generated.
        messageContent = UrlRegex.Replace(messageContent, BuildUrlAnchor);

        // Extract Hashtags
        // Longest matches first - presumably so a short match cannot be applied before a
        // longer one that contains the same text.
        foreach (var m in OrderByLength(HashtagRegexes.Hashtag.Matches(messageContent)))
        {
            var tag = m.Groups[2].ToString();
            if (!HashtagRegexes.HashtagName.IsMatch(tag))
            {
                _logger.LogError("Parsing Hashtag failed: {Tag} on {Content}", tag, messageContent);
                continue;
            }

            var url = $"https://{_instanceSettings.Domain}/tags/{tag}";

            tags.Add(new Tag
            {
                name = $"#{tag}",
                href = url,
                type = "Hashtag"
            });

            // Groups 1 and 3 (whatever the pattern captured around the name) are re-emitted as-is.
            // Plain ordinal string replacement: both sides are literal text, so nothing in the
            // captured groups can be interpreted as a regex substitution token.
            messageContent = messageContent.Replace(m.Groups[0].ToString(),
                $@"{m.Groups[1]}<a href=""{url}"" class=""mention hashtag"" rel=""tag"">#<span>{tag}</span></a>{m.Groups[3]}");
        }

        // Extract Mentions
        if (extractMentions)
        {
            foreach (var m in OrderByLength(UserRegexes.Mention.Matches(messageContent)))
            {
                var mention = m.Groups[2].ToString();
                if (!UserRegexes.TwitterAccount.IsMatch(mention))
                {
                    _logger.LogError("Parsing Mention failed: {Mention} on {Content}", mention, messageContent);
                    continue;
                }

                var url = $"https://{_instanceSettings.Domain}/users/{mention}";
                var name = $"@{mention}@{_instanceSettings.Domain}";

                tags.Add(new Tag
                {
                    name = name,
                    href = url,
                    type = "Mention"
                });

                // Same literal replacement strategy as for hashtags.
                messageContent = messageContent.Replace(m.Groups[0].ToString(),
                    $@"{m.Groups[1]}<span class=""h-card""><a href=""https://{_instanceSettings.Domain}/@{mention}"" class=""u-url mention"">@<span>{mention}</span></a></span>{m.Groups[3]}");
            }
        }

        return (messageContent.Trim(), tags.ToArray());
    }

    /// <summary>
    /// Builds the anchor markup that replaces one URL match.
    /// </summary>
    /// <param name="match">A match produced by <see cref="UrlRegex"/>.</param>
    /// <returns>
    /// A space followed by an anchor whose text is split into an invisible prefix (scheme and
    /// a leading "www."), the first <see cref="VisibleUrlLength"/> characters, and an invisible rest.
    /// </returns>
    private static string BuildUrlAnchor(Match match)
    {
        // The pattern cannot match whitespace, so the matched value is already the clean URL.
        var url = match.Value;

        // The match always starts with "<scheme>://", so take the scheme from its capture
        // group and strip it from the front only (never from the middle of the URL).
        var protocol = match.Groups[2].Value + "://";
        var remainder = url.Substring(protocol.Length);

        if (remainder.StartsWith(WwwPrefix, System.StringComparison.Ordinal))
        {
            protocol += WwwPrefix;
            remainder = remainder.Substring(WwwPrefix.Length);
        }

        var firstPart = remainder;
        var secondPart = string.Empty;
        if (remainder.Length > VisibleUrlLength)
        {
            firstPart = remainder.Substring(0, VisibleUrlLength);
            secondPart = remainder.Substring(VisibleUrlLength);
        }

        return $@" <a href=""{url}"" rel=""nofollow noopener noreferrer"" target=""_blank""><span class=""invisible"">{protocol}</span><span class=""ellipsis"">{firstPart}</span><span class=""invisible"">{secondPart}</span></a>";
    }

    /// <summary>
    /// Returns the matches sorted from longest to shortest. The sort is stable, so matches
    /// of equal length keep their original relative order.
    /// </summary>
    /// <param name="matches">Matches to sort.</param>
    /// <returns>A materialized list of the matches in descending length order.</returns>
    private static IEnumerable<Match> OrderByLength(MatchCollection matches)
    {
        return matches.Cast<Match>().OrderByDescending(x => x.Length).ToList();
    }
}
}