I’ve found myself having to convert HTML to plain text a few times in a row now and thought I would post my very simple solution for it.

It has a few caveats: for example, it won’t handle <pre> tags, margins on divs, and the like; it doesn’t even handle paragraphs. But I just wanted a simple conversion, so it was enough for me. Feel free to extend it. You can try it out using this dotnetfiddle.

/// <summary>
/// Minimal HTML-to-plain-text converter. Closing <c>div</c> tags and <c>br</c>
/// tags become line breaks, every other tag is removed, and HTML entities are
/// decoded. It does not try to honor <c>pre</c>, paragraphs, or CSS layout.
/// </summary>
public static class Html2PlainText {
    // Raw CR/LF characters in the markup; they are dropped so that only
    // explicit markup (</div>, <br>) produces line breaks in the output.
    private static readonly Regex NonExplicitLines = new Regex (@"\r|\n", RegexOptions.Compiled);

    // Closing div tags, tolerating whitespace before the '>'.
    private static readonly Regex DivEndings = new Regex (@"</div\s*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Any spelling of a line break tag: <br>, <br/>, <br />, <br clear="all">
    // and the non-standard </br>. The \b keeps tags such as <broken> from matching.
    private static readonly Regex LineBreaks = new Regex (@"</?br\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Every remaining tag, opening or closing.
    private static readonly Regex Tags = new Regex ("<[^>]*>", RegexOptions.Compiled);

    /// <summary>
    /// Converts an HTML fragment to plain text.
    /// </summary>
    /// <param name="html">The markup to convert. May be <c>null</c> or empty.</param>
    /// <returns>
    /// The plain-text rendering of <paramref name="html"/>. The input is returned
    /// unchanged when it is <c>null</c>, empty, or (after trimming) not wrapped in
    /// tags, i.e. does not start with '&lt;' and end with '&gt;'.
    /// </returns>
    public static string Decode (string html) {
        if (string.IsNullOrEmpty (html)) {
            return html;
        }

        var decoded = html.Trim ();
        if (!HasTags (decoded)) {
            return html;
        }

        // Order matters: raw newlines go first, then the explicit break markers are
        // turned into newlines before the generic tag stripper could remove them.
        decoded = NonExplicitLines.Replace (decoded, string.Empty);
        decoded = DivEndings.Replace (decoded, Environment.NewLine);
        decoded = LineBreaks.Replace (decoded, Environment.NewLine);
        decoded = Tags.Replace (decoded, string.Empty).Trim ();

        // Entities are decoded last so that an encoded "&lt;b&gt;" is not mistaken for a tag.
        return WebUtility.HtmlDecode (decoded);
    }

    /// <summary>
    /// Cheap heuristic for "this string is markup": it begins with '&lt;' and ends with '&gt;'.
    /// </summary>
    private static bool HasTags (string str) {
        // Ordinal comparison: these are markup delimiters, not linguistic text.
        return str.StartsWith ("<", StringComparison.Ordinal) && str.EndsWith (">", StringComparison.Ordinal);
    }
}