2020-07-31 22:49:00 -04:00
|
|
|
|
using System.Collections.Generic;
|
2020-08-03 02:10:20 -04:00
|
|
|
|
using System.Linq;
|
2020-07-31 22:49:00 -04:00
|
|
|
|
using System.Text.RegularExpressions;
|
|
|
|
|
using BirdsiteLive.ActivityPub.Models;
|
2021-02-01 20:23:54 -05:00
|
|
|
|
using BirdsiteLive.Common.Regexes;
|
2020-07-31 22:49:00 -04:00
|
|
|
|
using BirdsiteLive.Common.Settings;
|
2021-02-01 21:48:47 -05:00
|
|
|
|
using BirdsiteLive.Twitter;
|
|
|
|
|
using Microsoft.Extensions.Logging;
|
2023-01-20 10:23:18 -05:00
|
|
|
|
using System;
|
2020-07-31 22:49:00 -04:00
|
|
|
|
|
|
|
|
|
namespace BirdsiteLive.Domain.Tools
|
2020-07-31 22:13:52 -04:00
|
|
|
|
{
|
2020-07-31 22:49:00 -04:00
|
|
|
|
/// <summary>
/// Contract for components that turn raw status text into publishable content
/// together with the tags referenced by that text.
/// </summary>
public interface IStatusExtractor
{
    /// <summary>
    /// Processes a status message and returns the resulting content and its tags.
    /// </summary>
    /// <param name="messageContent">Raw text of the status to process.</param>
    /// <param name="extractMentions">
    /// Whether mentions should be extracted as well; defaults to <c>true</c>.
    /// </param>
    /// <returns>
    /// A tuple holding the processed content and the tags found while processing it.
    /// </returns>
    (string content, Tag[] tags) Extract(string messageContent, bool extractMentions = true);
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Converts raw status text into HTML: line breaks become paragraph / line-break
/// markup, URLs become anchors, and hashtags and mentions become links pointing at
/// the configured instance domain. Every hashtag and mention that is linked is
/// also reported as a <see cref="Tag"/>.
/// </summary>
/// <remarks>
/// NOTE(review): matched text is interpolated into HTML without encoding;
/// presumably the regexes (defined outside this file) restrict it to safe
/// characters - confirm.
/// </remarks>
public class StatusExtractor : IStatusExtractor
{
    // Number of leading characters of a link's display text placed in the
    // "ellipsis" span; anything beyond goes into a trailing "invisible" span.
    private const int VisibleUrlLength = 30;

    private const string WwwPrefix = "www.";

    private readonly InstanceSettings _instanceSettings;
    private readonly ILogger<StatusExtractor> _logger;

    #region Ctor
    /// <summary>
    /// Creates an extractor.
    /// </summary>
    /// <param name="instanceSettings">Settings providing the domain used to build tag and user links.</param>
    /// <param name="logger">Logger used to report hashtags or mentions that fail validation.</param>
    public StatusExtractor(InstanceSettings instanceSettings, ILogger<StatusExtractor> logger)
    {
        _instanceSettings = instanceSettings;
        _logger = logger;
    }
    #endregion

    /// <summary>
    /// Rewrites <paramref name="messageContent"/> as HTML and collects its tags.
    /// </summary>
    /// <param name="messageContent">Raw status text.</param>
    /// <param name="extractMentions">
    /// When <c>true</c> (the default) mentions are linked and reported as tags;
    /// when <c>false</c> they are left untouched.
    /// </param>
    /// <returns>
    /// The trimmed HTML content and the distinct (by href) hashtag / mention tags
    /// in the order they were processed.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="messageContent"/> is <c>null</c> (raised by the first regex replacement).
    /// </exception>
    public (string content, Tag[] tags) Extract(string messageContent, bool extractMentions = true)
    {
        var tags = new List<Tag>();

        // Replace return lines: double breaks first so they are not consumed
        // as two single breaks by the second pass.
        messageContent = Regex.Replace(messageContent, @"\r\n\r\n?|\n\n", "</p><p>");
        messageContent = Regex.Replace(messageContent, @"\r\n?|\n", "<br/>");

        // Extract Urls
        foreach (Match m in UrlRegexes.Url.Matches(messageContent))
        {
            var url = m.Groups[2].ToString();
            var protocol = m.Groups[3].ToString();

            // Strip the scheme only where it leads the URL. A blanket Replace would
            // also delete any later occurrence (e.g. a URL embedded in the path) and
            // throws when the captured scheme is empty.
            // Assumes group 3 is the scheme prefix of group 2 - the pattern lives
            // outside this file; TODO confirm.
            var truncatedUrl = protocol.Length > 0 && url.StartsWith(protocol, StringComparison.Ordinal)
                ? url.Substring(protocol.Length)
                : url;

            // A leading "www." is hidden together with the scheme. Ordinal comparison:
            // this is an identifier, not linguistic text.
            if (truncatedUrl.StartsWith(WwwPrefix, StringComparison.Ordinal))
            {
                protocol += WwwPrefix;
                truncatedUrl = truncatedUrl.Substring(WwwPrefix.Length);
            }

            var firstPart = truncatedUrl;
            var secondPart = string.Empty;

            if (truncatedUrl.Length > VisibleUrlLength)
            {
                firstPart = truncatedUrl.Substring(0, VisibleUrlLength);
                secondPart = truncatedUrl.Substring(VisibleUrlLength);
            }

            messageContent = ReplaceLiteral(messageContent, m.ToString(),
                $@"{m.Groups[1]}<a href=""{url}"" rel=""nofollow noopener noreferrer"" target=""_blank""><span class=""invisible"">{protocol}</span><span class=""ellipsis"">{firstPart}</span><span class=""invisible"">{secondPart}</span></a>");
        }

        // Extract Hashtags (longest match first)
        var hashtagMatch = OrderByLength(HashtagRegexes.Hashtag.Matches(messageContent));
        foreach (Match m in hashtagMatch.OrderByDescending(x => x.Length))
        {
            var tag = m.Groups[2].ToString();

            if (!HashtagRegexes.HashtagName.IsMatch(tag))
            {
                _logger.LogError("Parsing Hashtag failed: {Tag} on {Content}", tag, messageContent);
                continue;
            }

            var url = $"https://{_instanceSettings.Domain}/tags/{tag}";

            AddTagOnce(tags, $"#{tag}", url, "Hashtag");

            messageContent = ReplaceLiteral(messageContent, m.Groups[0].ToString(),
                $@"{m.Groups[1]}<a href=""{url}"" class=""mention hashtag"" rel=""tag"">#<span>{tag}</span></a>{m.Groups[3]}");
        }

        // Extract Mentions
        if (extractMentions)
        {
            // NOTE(review): unlike the hashtag pass, mentions are processed in the
            // shortest-first order returned by OrderByLength - confirm this is intended.
            var mentionMatch = OrderByLength(UserRegexes.Mention.Matches(messageContent));
            foreach (Match m in mentionMatch)
            {
                var mention = m.Groups[2].ToString();

                if (!UserRegexes.TwitterAccount.IsMatch(mention))
                {
                    _logger.LogError("Parsing Mention failed: {Mention} on {Content}", mention, messageContent);
                    continue;
                }

                // Invariant lower-casing keeps the URL independent of the server's
                // current culture (e.g. Turkish dotless i).
                var url = $"https://{_instanceSettings.Domain}/users/{mention.ToLowerInvariant()}";
                var name = $"@{mention}";

                AddTagOnce(tags, name, url, "Mention");

                messageContent = ReplaceLiteral(messageContent, m.Groups[0].ToString(),
                    $@"{m.Groups[1]}<span class=""h-card""><a href=""{url}"" class=""u-url mention"">@<span>{mention}</span></a></span>{m.Groups[3]}");
            }
        }

        return (messageContent.Trim(), tags.ToArray());
    }

    /// <summary>
    /// Replaces every occurrence of <paramref name="literal"/> in
    /// <paramref name="input"/> with <paramref name="replacement"/>, treating both verbatim.
    /// </summary>
    /// <remarks>
    /// A match evaluator is used so that '$' sequences in the replacement (which can
    /// come from user-supplied URLs or surrounding text) are not expanded as regex
    /// substitutions.
    /// </remarks>
    private static string ReplaceLiteral(string input, string literal, string replacement)
    {
        return Regex.Replace(input, Regex.Escape(literal), match => replacement);
    }

    /// <summary>
    /// Adds a tag to <paramref name="tags"/> unless one with the same href is already present.
    /// </summary>
    private static void AddTagOnce(List<Tag> tags, string name, string href, string type)
    {
        if (tags.All(x => x.href != href))
        {
            tags.Add(new Tag
            {
                name = name,
                href = href,
                type = type
            });
        }
    }

    /// <summary>
    /// Returns the matches sorted by ascending length with duplicates (same matched
    /// text) removed, keeping the first match of each distinct text.
    /// </summary>
    private static IEnumerable<Match> OrderByLength(MatchCollection matches)
    {
        return matches
            .Cast<Match>()
            .OrderBy(x => x.Length)
            .GroupBy(p => p.Value)
            .Select(g => g.First())
            .ToList();
    }
}
|
|
|
|
|
}
|