| | | 1 | | using System.Text; |
| | | 2 | | using System.Text.RegularExpressions; |
| | | 3 | | |
| | | 4 | | namespace Chronicis.Api.Services; |
| | | 5 | | |
| | | 6 | | /// <summary> |
| | | 7 | | /// Converts TipTap HTML content to Markdown for export. |
| | | 8 | | /// Pure text transformation with no external dependencies. |
| | | 9 | | /// </summary> |
| | | 10 | | public static partial class HtmlToMarkdownConverter |
| | | 11 | | { |
| | | 12 | | // ── Compiled regex patterns ────────────────────────────────────────────── |
| | | 13 | | |
| | | 14 | | [GeneratedRegex(@"<span[^>]*data-type=""wiki-link""[^>]*data-display=""([^""]+)""[^>]*>.*?</span>", RegexOptions.Ign |
| | | 15 | | private static partial Regex WikiLinkWithDisplay(); |
| | | 16 | | |
/// <summary>
/// Matches a span carrying <c>data-type="wiki-link"</c> whose content contains no
/// nested tags. Group 1 captures that text content.
/// </summary>
[GeneratedRegex(
    @"<span[^>]*data-type=""wiki-link""[^>]*>([^<]+)</span>",
    RegexOptions.IgnoreCase)]
private static partial Regex WikiLinkPlain();
| | | 19 | | |
// The inline-tag patterns below place a word boundary (\b) right after the tag name.
// Without it, "<b[^>]*>" also matches <br> and <blockquote>, "<i[^>]*>" matches <img>,
// "<em[^>]*>" matches <embed> and "<a[^>]*" matches <area>/<abbr>; the lazy body would
// then run on to the next closing tag and swallow unrelated markup.

/// <summary>
/// Matches a <c>&lt;strong&gt;</c> element (any attributes). Group 1 is the inner content.
/// </summary>
[GeneratedRegex(@"<strong\b[^>]*>(.*?)</strong>", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
private static partial Regex StrongTag();

/// <summary>
/// Matches a <c>&lt;b&gt;</c> element (any attributes). Group 1 is the inner content.
/// </summary>
[GeneratedRegex(@"<b\b[^>]*>(.*?)</b>", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
private static partial Regex BoldTag();

/// <summary>
/// Matches an <c>&lt;em&gt;</c> element (any attributes). Group 1 is the inner content.
/// </summary>
[GeneratedRegex(@"<em\b[^>]*>(.*?)</em>", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
private static partial Regex EmTag();

/// <summary>
/// Matches an <c>&lt;i&gt;</c> element (any attributes). Group 1 is the inner content.
/// </summary>
[GeneratedRegex(@"<i\b[^>]*>(.*?)</i>", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
private static partial Regex ItalicTag();

/// <summary>
/// Matches an <c>&lt;a&gt;</c> element with a double-quoted <c>href</c>.
/// Group 1 is the href value, group 2 is the link content.
/// </summary>
[GeneratedRegex(@"<a\b[^>]*href=""([^""]*)""[^>]*>(.*?)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
private static partial Regex AnchorTag();
| | | 34 | | |
/// <summary>
/// Matches a <c>&lt;pre&gt;&lt;code&gt;…&lt;/code&gt;&lt;/pre&gt;</c> pair with nothing between
/// the adjacent tags. Group 1 is the (possibly multi-line) code text.
/// </summary>
[GeneratedRegex(
    @"<pre[^>]*><code[^>]*>([\s\S]*?)</code></pre>",
    RegexOptions.IgnoreCase)]
private static partial Regex PreCodeBlock();

/// <summary>
/// Matches a <c>&lt;code&gt;</c> element. Group 1 is the inner content.
/// </summary>
[GeneratedRegex(
    @"<code[^>]*>(.*?)</code>",
    RegexOptions.Singleline | RegexOptions.IgnoreCase)]
private static partial Regex InlineCode();

/// <summary>
/// Matches a <c>&lt;blockquote&gt;</c> element up to the first closing tag.
/// Group 1 is the inner content.
/// </summary>
[GeneratedRegex(
    @"<blockquote[^>]*>([\s\S]*?)</blockquote>",
    RegexOptions.IgnoreCase)]
private static partial Regex BlockquoteTag();
| | | 43 | | |
/// <summary>
/// Matches a <c>&lt;p&gt;</c> element (any attributes). Group 1 is the inner content.
/// The <c>\b</c> after the tag name keeps the pattern from also matching tags that
/// merely start with "p", such as <c>&lt;pre&gt;</c>.
/// </summary>
[GeneratedRegex(@"<p\b[^>]*>(.*?)</p>", RegexOptions.IgnoreCase | RegexOptions.Singleline)]
private static partial Regex ParagraphTag();
| | | 46 | | |
/// <summary>
/// Greedy match from the first <c>&lt;ul&gt;</c> to the last <c>&lt;/ul&gt;</c> in the input.
/// Group 1 is everything in between.
/// </summary>
[GeneratedRegex(
    @"<ul[^>]*>([\s\S]*)</ul>",
    RegexOptions.IgnoreCase)]
private static partial Regex UnorderedList();

/// <summary>
/// Greedy match from the first <c>&lt;ol&gt;</c> to the last <c>&lt;/ol&gt;</c> in the input.
/// Group 1 is everything in between.
/// </summary>
[GeneratedRegex(
    @"<ol[^>]*>([\s\S]*)</ol>",
    RegexOptions.IgnoreCase)]
private static partial Regex OrderedList();

/// <summary>Matches an opening <c>&lt;li&gt;</c> tag with optional attributes.</summary>
[GeneratedRegex(
    @"<li[^>]*>",
    RegexOptions.IgnoreCase)]
private static partial Regex ListItemOpen();

/// <summary>Matches a closing <c>&lt;/li&gt;</c> tag.</summary>
[GeneratedRegex(
    @"</li>",
    RegexOptions.IgnoreCase)]
private static partial Regex ListItemClose();

/// <summary>
/// Greedy match from the first <c>&lt;ul&gt;</c>/<c>&lt;ol&gt;</c> opening tag to the last
/// <c>&lt;/ul&gt;</c>/<c>&lt;/ol&gt;</c> closing tag. Group 1 is the whole match, tags included.
/// </summary>
[GeneratedRegex(
    @"(<[uo]l[^>]*>[\s\S]*</[uo]l>)",
    RegexOptions.IgnoreCase)]
private static partial Regex NestedList();

/// <summary>Matches any single tag-shaped token: <c>&lt;</c>, one or more non-<c>&gt;</c> characters, <c>&gt;</c>.</summary>
[GeneratedRegex(
    @"<[^>]+>")]
private static partial Regex AnyTag();

/// <summary>Matches <c>&lt;br&gt;</c>, <c>&lt;br/&gt;</c> and <c>&lt;br /&gt;</c>.</summary>
[GeneratedRegex(
    @"<br\s*/?>",
    RegexOptions.IgnoreCase)]
private static partial Regex BreakTag();

/// <summary>Matches an <c>&lt;hr&gt;</c> tag, with or without attributes or a self-closing slash.</summary>
[GeneratedRegex(
    @"<hr[^>]*/?>",
    RegexOptions.IgnoreCase)]
private static partial Regex HorizontalRule();

/// <summary>Matches a run of three or more consecutive line-feed characters.</summary>
[GeneratedRegex(
    @"\n{3,}")]
private static partial Regex ExcessiveNewlines();
| | | 73 | | |
/// <summary>
/// Heading regexes ordered by level: element 0 matches h1, element 5 matches h6.
/// </summary>
private static readonly Regex[] HeaderPatterns = new[]
{
    H1Regex(),
    H2Regex(),
    H3Regex(),
    H4Regex(),
    H5Regex(),
    H6Regex(),
};
| | | 81 | | |
/// <summary>Matches an <c>&lt;h1&gt;</c> element; group 1 is the heading content.</summary>
[GeneratedRegex(
    @"<h1[^>]*>(.*?)</h1>",
    RegexOptions.Singleline | RegexOptions.IgnoreCase)]
private static partial Regex H1Regex();

/// <summary>Matches an <c>&lt;h2&gt;</c> element; group 1 is the heading content.</summary>
[GeneratedRegex(
    @"<h2[^>]*>(.*?)</h2>",
    RegexOptions.Singleline | RegexOptions.IgnoreCase)]
private static partial Regex H2Regex();

/// <summary>Matches an <c>&lt;h3&gt;</c> element; group 1 is the heading content.</summary>
[GeneratedRegex(
    @"<h3[^>]*>(.*?)</h3>",
    RegexOptions.Singleline | RegexOptions.IgnoreCase)]
private static partial Regex H3Regex();

/// <summary>Matches an <c>&lt;h4&gt;</c> element; group 1 is the heading content.</summary>
[GeneratedRegex(
    @"<h4[^>]*>(.*?)</h4>",
    RegexOptions.Singleline | RegexOptions.IgnoreCase)]
private static partial Regex H4Regex();

/// <summary>Matches an <c>&lt;h5&gt;</c> element; group 1 is the heading content.</summary>
[GeneratedRegex(
    @"<h5[^>]*>(.*?)</h5>",
    RegexOptions.Singleline | RegexOptions.IgnoreCase)]
private static partial Regex H5Regex();

/// <summary>Matches an <c>&lt;h6&gt;</c> element; group 1 is the heading content.</summary>
[GeneratedRegex(
    @"<h6[^>]*>(.*?)</h6>",
    RegexOptions.Singleline | RegexOptions.IgnoreCase)]
private static partial Regex H6Regex();
| | | 99 | | |
| | | 100 | | // ── Public API ─────────────────────────────────────────────────────────── |
| | | 101 | | |
/// <summary>
/// Converts HTML content to Markdown.
/// </summary>
/// <param name="html">HTML to convert. Null, empty or whitespace-only input yields an empty string.</param>
/// <returns>The Markdown text produced by running every conversion stage in order.</returns>
public static string Convert(string html)
{
    if (string.IsNullOrWhiteSpace(html))
    {
        return string.Empty;
    }

    // Stage order matters: element-specific conversions run first, then leftover
    // tags are stripped, entities are decoded, and blank lines are collapsed.
    var withWikiLinks = ConvertWikiLinks(html);
    var withHeaders = ConvertHeaders(withWikiLinks);
    var withInline = ConvertInlineFormatting(withHeaders);
    var withLinks = ConvertLinks(withInline);
    var withCode = ConvertCodeBlocks(withLinks);
    var withQuotes = ConvertBlockquotes(withCode);
    var withLists = ConvertLists(withQuotes);
    var withParagraphs = ConvertParagraphsAndBreaks(withLists);
    var tagFree = StripRemainingTags(withParagraphs);
    var decoded = System.Net.WebUtility.HtmlDecode(tagFree);

    return NormalizeWhitespace(decoded);
}
| | | 126 | | |
| | | 127 | | // ── Wiki Links ────────────────────────────────────────── |
| | | 128 | | |
/// <summary>
/// Rewrites wiki-link spans as <c>[[target]]</c>. Spans matched by
/// <c>WikiLinkWithDisplay</c> are replaced first, then those matched by <c>WikiLinkPlain</c>.
/// </summary>
internal static string ConvertWikiLinks(string html)
    => WikiLinkPlain().Replace(
        WikiLinkWithDisplay().Replace(html, "[[$1]]"),
        "[[$1]]");
| | | 134 | | |
| | | 135 | | // ── Headers ───────────────────────────────────────────── |
| | | 136 | | |
/// <summary>
/// Replaces h1–h6 elements with Markdown headings: one <c>#</c> per level, a space,
/// the heading content, then a blank line.
/// </summary>
internal static string ConvertHeaders(string html)
{
    var converted = html;
    var level = 0;

    foreach (var pattern in HeaderPatterns)
    {
        level++;
        var hashes = new string('#', level);
        converted = pattern.Replace(converted, hashes + " $1\n\n");
    }

    return converted;
}
| | | 147 | | |
| | | 148 | | // ── Inline Formatting ─────────────────────────────────── |
| | | 149 | | |
/// <summary>
/// Converts bold (<c>strong</c>, <c>b</c>) to <c>**text**</c> and italic
/// (<c>em</c>, <c>i</c>) to <c>*text*</c>, in that order.
/// </summary>
internal static string ConvertInlineFormatting(string html)
{
    const string boldReplacement = "**$1**";
    const string italicReplacement = "*$1*";

    var text = html;
    text = StrongTag().Replace(text, boldReplacement);
    text = BoldTag().Replace(text, boldReplacement);
    text = EmTag().Replace(text, italicReplacement);
    text = ItalicTag().Replace(text, italicReplacement);
    return text;
}
| | | 157 | | |
| | | 158 | | // ── Links ─────────────────────────────────────────────── |
| | | 159 | | |
/// <summary>
/// Rewrites anchors matched by <c>AnchorTag</c> as Markdown links: <c>[content](href)</c>.
/// </summary>
internal static string ConvertLinks(string html)
{
    var anchors = AnchorTag();
    return anchors.Replace(html, "[$2]($1)");
}
| | | 162 | | |
| | | 163 | | // ── Code ──────────────────────────────────────────────── |
| | | 164 | | |
/// <summary>
/// Converts <c>pre/code</c> pairs to fenced code blocks, then any remaining
/// <c>code</c> elements to backtick-delimited inline code.
/// </summary>
internal static string ConvertCodeBlocks(string html)
{
    // Fenced blocks go first so their inner <code> is not treated as inline code.
    var fenced = PreCodeBlock().Replace(html, "```\n$1\n```\n\n");
    var inlined = InlineCode().Replace(fenced, "`$1`");
    return inlined;
}
| | | 170 | | |
| | | 171 | | // ── Blockquotes ───────────────────────────────────────── |
| | | 172 | | |
/// <summary>
/// Converts each blockquote into Markdown quote lines: paragraph wrappers are
/// removed, every non-blank line is trimmed and prefixed with <c>&gt; </c>, and the
/// block is followed by a blank line.
/// </summary>
internal static string ConvertBlockquotes(string html)
{
    return BlockquoteTag().Replace(html, QuoteBlock);

    static string QuoteBlock(Match match)
    {
        var inner = ParagraphTag().Replace(match.Groups[1].Value, "$1");
        var quoted = new List<string>();

        foreach (var rawLine in inner.Split('\n'))
        {
            var trimmed = rawLine.Trim();
            if (trimmed.Length == 0)
            {
                // Whitespace-only lines produce no quote line.
                continue;
            }

            quoted.Add("> " + trimmed);
        }

        return string.Join("\n", quoted) + "\n\n";
    }
}
| | | 185 | | |
| | | 186 | | // ── Lists ─────────────────────────────────────────────── |
| | | 187 | | |
/// <summary>Matches an opening or closing ul/ol tag. Group 1 is "/" for a closing tag; group 2 is the tag name.</summary>
[GeneratedRegex(@"<(/?)(ul|ol)\b[^>]*>", RegexOptions.IgnoreCase)]
private static partial Regex ListBoundaryTag();

/// <summary>
/// Converts every top-level <c>ul</c>/<c>ol</c> element to a Markdown list.
/// </summary>
/// <remarks>
/// Lists are located by scanning list tags in document order and pairing each
/// top-level opening tag with its balanced closing tag. This keeps content that sits
/// between two separate lists, and hands a list nested inside another list (of either
/// kind) to <c>ProcessList</c> as part of its parent item instead of converting it first.
/// Opening tags with no balanced close, and stray closing tags, are left untouched.
/// </remarks>
/// <param name="html">HTML that may contain list elements.</param>
/// <returns>The input with each balanced top-level list replaced by Markdown.</returns>
internal static string ConvertLists(string html)
{
    ArgumentNullException.ThrowIfNull(html);

    var output = new StringBuilder(html.Length);
    var cursor = 0; // start of the text not yet copied to output

    var tag = ListBoundaryTag().Match(html);
    while (tag.Success)
    {
        if (tag.Groups[1].Length > 0)
        {
            // Closing tag with no open list: not ours to convert.
            tag = tag.NextMatch();
            continue;
        }

        var open = tag;
        var depth = 1;
        var close = open.NextMatch();
        while (close.Success)
        {
            depth += close.Groups[1].Length > 0 ? -1 : 1;
            if (depth == 0)
            {
                break;
            }

            close = close.NextMatch();
        }

        if (!close.Success)
        {
            // Unbalanced opening tag: skip it and try the next tag as a list start.
            tag = open.NextMatch();
            continue;
        }

        output.Append(html, cursor, open.Index - cursor);

        var inner = html[(open.Index + open.Length)..close.Index];
        var ordered = open.Groups[2].Value.Equals("ol", StringComparison.OrdinalIgnoreCase);
        output.Append(ProcessList(inner, ordered, indentLevel: 0));

        cursor = close.Index + close.Length;
        tag = close.NextMatch();
    }

    output.Append(html, cursor, html.Length - cursor);
    return output.ToString();
}
| | | 208 | | |
/// <summary>
/// Renders the inner HTML of one list (the content between its ul/ol tags) as Markdown.
/// </summary>
/// <remarks>
/// Lines always end in <c>\n</c> rather than <c>Environment.NewLine</c>, matching the
/// replacement strings used by the other conversion stages and the <c>\n</c> runs
/// that <c>NormalizeWhitespace</c> collapses.
/// </remarks>
/// <param name="listContent">HTML between the list's opening and closing tags.</param>
/// <param name="ordered">True to number items (<c>1.</c>, <c>2.</c>, …); false to use <c>- </c>.</param>
/// <param name="indentLevel">Nesting depth; each level indents by two spaces.</param>
/// <returns>One Markdown line per item, plus any nested lists; a top-level list (level 0) ends with an extra blank line.</returns>
internal static string ProcessList(string listContent, bool ordered, int indentLevel)
{
    var sb = new StringBuilder();
    var indent = new string(' ', indentLevel * 2);
    var counter = 1;

    foreach (var itemContent in ExtractListItems(listContent))
    {
        // Separate the item's own text from a nested list it may contain.
        var (textContent, nestedListHtml) = SplitNestedList(itemContent);
        textContent = StripInlineTags(textContent);

        var prefix = ordered ? $"{counter}. " : "- ";
        sb.Append(indent).Append(prefix).Append(textContent).Append('\n');
        counter++;

        if (!string.IsNullOrEmpty(nestedListHtml))
        {
            sb.Append(RenderNestedList(nestedListHtml, indentLevel + 1));
        }
    }

    // Only the outermost list is followed by a separating blank line.
    if (indentLevel == 0)
    {
        sb.Append('\n');
    }

    return sb.ToString();
}
| | | 240 | | |
/// <summary>
/// Returns the inner HTML of each direct <c>li</c> item in <paramref name="listContent"/>.
/// </summary>
/// <remarks>
/// For each opening tag, nested <c>li</c> open/close tags are depth-counted to find
/// the matching close. Opening tags that fall inside an item already captured belong
/// to a nested list and are skipped, so nested items are not also returned as
/// top-level items. An item with no matching close is dropped.
/// </remarks>
/// <param name="listContent">HTML between a list's opening and closing tags.</param>
/// <returns>Item contents in document order; nested list markup stays inside its parent item.</returns>
private static List<string> ExtractListItems(string listContent)
{
    var items = new List<string>();
    var consumedUpTo = 0; // end offset of the last captured item, closing tag included

    foreach (Match openMatch in ListItemOpen().Matches(listContent))
    {
        if (openMatch.Index < consumedUpTo)
        {
            // This <li> lies inside an item we already captured (a nested list's item).
            continue;
        }

        var start = openMatch.Index + openMatch.Length;
        var depth = 1;
        var pos = start;

        while (pos < listContent.Length && depth > 0)
        {
            // Search from pos with absolute indices instead of slicing the string.
            var nextClose = ListItemClose().Match(listContent, pos);
            if (!nextClose.Success)
            {
                break;
            }

            var nextOpen = ListItemOpen().Match(listContent, pos);
            if (nextOpen.Success && nextOpen.Index < nextClose.Index)
            {
                depth++;
                pos = nextOpen.Index + nextOpen.Length;
            }
            else
            {
                depth--;
                if (depth == 0)
                {
                    items.Add(listContent[start..nextClose.Index]);
                    consumedUpTo = nextClose.Index + nextClose.Length;
                }

                pos = nextClose.Index + nextClose.Length;
            }
        }
    }

    return items;
}
| | | 279 | | |
/// <summary>
/// Splits a list item into the text before its first nested list and the nested
/// list markup itself. When no nested list is found, the whole item is returned
/// as text and the nested part is an empty string.
/// </summary>
private static (string text, string nestedHtml) SplitNestedList(string itemContent)
{
    var nested = NestedList().Match(itemContent);

    return nested.Success
        ? (itemContent[..nested.Index], nested.Groups[1].Value)
        : (itemContent, "");
}
| | | 289 | | |
/// <summary>
/// Renders the nested list markup found inside a list item at the given indent level.
/// </summary>
/// <remarks>
/// Both list regexes are tried against <paramref name="nestedHtml"/>. A match that lies
/// entirely inside the other match is a deeper nested list; <c>ProcessList</c> renders it
/// as part of its parent, so it is skipped here to avoid emitting it twice. When both
/// matches are rendered, they are emitted in document order.
/// </remarks>
/// <param name="nestedHtml">List markup including its own ul/ol tags.</param>
/// <param name="indentLevel">Indent level for the nested list's items.</param>
/// <returns>The rendered Markdown, or an empty string when no list is matched.</returns>
private static string RenderNestedList(string nestedHtml, int indentLevel)
{
    var ulMatch = UnorderedList().Match(nestedHtml);
    var olMatch = OrderedList().Match(nestedHtml);

    var renderUl = ulMatch.Success && !(olMatch.Success && Encloses(olMatch, ulMatch));
    var renderOl = olMatch.Success && !(ulMatch.Success && Encloses(ulMatch, olMatch));

    var sb = new StringBuilder();

    if (renderUl && renderOl && olMatch.Index < ulMatch.Index)
    {
        // Sibling lists with the ordered one first: keep document order.
        sb.Append(ProcessList(olMatch.Groups[1].Value, ordered: true, indentLevel));
        sb.Append(ProcessList(ulMatch.Groups[1].Value, ordered: false, indentLevel));
        return sb.ToString();
    }

    if (renderUl)
    {
        sb.Append(ProcessList(ulMatch.Groups[1].Value, ordered: false, indentLevel));
    }

    if (renderOl)
    {
        sb.Append(ProcessList(olMatch.Groups[1].Value, ordered: true, indentLevel));
    }

    return sb.ToString();

    // True when inner's span lies entirely within outer's span.
    static bool Encloses(Match outer, Match inner)
        => outer.Index <= inner.Index
           && inner.Index + inner.Length <= outer.Index + outer.Length;
}
| | | 304 | | |
/// <summary>
/// Unwraps paragraph elements, removes every remaining tag, and trims the result.
/// </summary>
private static string StripInlineTags(string text)
{
    var unwrapped = ParagraphTag().Replace(text, "$1");
    var tagless = AnyTag().Replace(unwrapped, "");
    return tagless.Trim();
}
| | | 310 | | |
| | | 311 | | // ── Paragraphs & Breaks ───────────────────────────────── |
| | | 312 | | |
/// <summary>
/// Replaces paragraphs with their content plus a blank line, <c>br</c> tags with a
/// single newline, and <c>hr</c> tags with a <c>---</c> rule surrounded by newlines.
/// </summary>
internal static string ConvertParagraphsAndBreaks(string html)
{
    var text = html;
    text = ParagraphTag().Replace(text, "$1\n\n");
    text = BreakTag().Replace(text, "\n");
    text = HorizontalRule().Replace(text, "\n---\n\n");
    return text;
}
| | | 319 | | |
| | | 320 | | // ── Cleanup ───────────────────────────────────────────── |
| | | 321 | | |
/// <summary>
/// Removes every remaining tag-shaped token (as matched by <c>AnyTag</c>) from the text.
/// </summary>
internal static string StripRemainingTags(string html)
{
    var tags = AnyTag();
    return tags.Replace(html, "");
}
| | | 324 | | |
/// <summary>
/// Collapses runs of three or more newlines into exactly two and trims
/// leading and trailing whitespace.
/// </summary>
internal static string NormalizeWhitespace(string text)
{
    var collapsed = ExcessiveNewlines().Replace(text, "\n\n");
    return collapsed.Trim();
}
| | | 327 | | } |