skip to Main Content

I’d like to compare two HTML strings whose tags carry the same attributes and style properties, but written in a different order.

Example:

string str1="""<p><strong style="font-size: 36px; color: rgb(153, 51, 255);">Hello </strong><em><u>World</u></em></p>""";

string str2="""<p><strong style="color: rgb(153, 51, 255); font-size: 36px;">Hello </strong><em><u>World</u></em></p>""";

I care about the text as well as the style, so I want the comparison to report the two strings as identical: they contain the same text and render the same HTML. However, a normal string comparison reports that the two strings are different.

How can I compare these two strings using C#, not JavaScript?

2

Answers


  1. To compare the two strings, you have to take a few steps:

    1. Reorder the attributes of the HTML tags in both strings so that they appear in the same order.
    2. Sort the values inside attributes that take multiple values, such as style.
    3. Trim the text values.

    To do that, you first need to add the NuGet package HtmlAgilityPack.
    Here is some sample code:

    /// <summary>
    /// Compares two HTML fragments for equivalence by rewriting each one into a
    /// canonical string (attributes sorted by name, <c>style</c> declarations sorted
    /// by property name, whitespace collapsed) and comparing the results.
    /// </summary>
    /// <remarks>
    /// Requires the HtmlAgilityPack NuGet package and the namespaces
    /// <c>System</c>, <c>System.Linq</c>, <c>System.Text</c>,
    /// <c>System.Text.RegularExpressions</c> and <c>HtmlAgilityPack</c>.
    /// </remarks>
    class Program
    {
        /// <summary>Runs the comparison on the two sample strings and prints the result.</summary>
        static void Main()
        {
            // The inner double quotes must be escaped for these literals to compile.
            string str1 = "<p><strong style=\"font-size: 36px; color: rgb(153, 51, 255);\">Hello </strong><em><u>World</u></em></p>";
            string str2 = "<p><strong style=\"color: rgb(153, 51, 255); font-size: 36px;\">Hello </strong><em><u>World</u></em></p>";

            bool areIdentical = AreHtmlStringsIdentical(str1, str2);
            Console.WriteLine($"The HTML strings are identical: {areIdentical}");
        }

        /// <summary>
        /// Parses both inputs with HtmlAgilityPack and compares their canonical forms.
        /// </summary>
        /// <param name="html1">First HTML fragment.</param>
        /// <param name="html2">Second HTML fragment.</param>
        /// <returns><c>true</c> when both fragments produce the same canonical string.</returns>
        static bool AreHtmlStringsIdentical(string html1, string html2)
        {
            var doc1 = new HtmlDocument();
            doc1.LoadHtml(html1);
            var doc2 = new HtmlDocument();
            doc2.LoadHtml(html2);

            string canonicalHtml1 = GetCanonicalHtml(doc1.DocumentNode);
            string canonicalHtml2 = GetCanonicalHtml(doc2.DocumentNode);

            return string.Equals(canonicalHtml1, canonicalHtml2, StringComparison.Ordinal);
        }

        /// <summary>
        /// Recursively serializes <paramref name="node"/> into a canonical string.
        /// Text nodes are emitted with trimmed, collapsed whitespace; every other node
        /// is emitted as <c>&lt;name sorted-attributes&gt;children&lt;/name&gt;</c>.
        /// </summary>
        /// <param name="node">The node to serialize.</param>
        /// <returns>The canonical representation used only for comparison, not for rendering.</returns>
        static string GetCanonicalHtml(HtmlNode node)
        {
            if (node.NodeType == HtmlNodeType.Text)
            {
                return NormalizeWhitespace(node.InnerText);
            }

            var builder = new StringBuilder();
            builder.Append('<').Append(node.Name);

            // Ordinal ordering keeps the canonical form independent of the current culture.
            var sortedAttributes = node.Attributes.OrderBy(a => a.Name, StringComparer.Ordinal);
            foreach (var attribute in sortedAttributes)
            {
                string value = attribute.Value;
                if (attribute.Name == "style")
                {
                    value = NormalizeStyleAttribute(value);
                }
                builder.Append(' ')
                       .Append(attribute.Name)
                       .Append("=\"")
                       .Append(NormalizeWhitespace(value))
                       .Append('"');
            }
            builder.Append('>');

            foreach (var child in node.ChildNodes)
            {
                builder.Append(GetCanonicalHtml(child));
            }

            builder.Append("</").Append(node.Name).Append('>');

            return builder.ToString();
        }

        /// <summary>
        /// Trims <paramref name="input"/> and collapses every run of whitespace into a single space.
        /// </summary>
        static string NormalizeWhitespace(string input)
        {
            // The pattern must be \s+ (whitespace); a bare "s+" would match the letter 's'.
            return Regex.Replace(input.Trim(), @"\s+", " ");
        }

        /// <summary>
        /// Rewrites a <c>style</c> attribute value as <c>name: value; name: value;</c>
        /// with the declarations sorted by property name.
        /// </summary>
        /// <param name="style">Raw value of the <c>style</c> attribute.</param>
        /// <returns>The normalized declaration list, or an empty string when there are no declarations.</returns>
        /// <remarks>
        /// Declarations are separated by a plain split on ';', so a semicolon inside
        /// a quoted string or <c>url(...)</c> is not handled.
        /// </remarks>
        static string NormalizeStyleAttribute(string style)
        {
            var styles = style.Split(';')
                              .Select(s => s.Trim())
                              .Where(s => !string.IsNullOrEmpty(s))
                              .Select(s =>
                              {
                                  // Split on the first ':' only, so values that contain a colon
                                  // (for example url(http://...)) are kept whole.
                                  var parts = s.Split(new[] { ':' }, 2);
                                  return new
                                  {
                                      Name = parts[0].Trim(),
                                      Value = parts.Length > 1 ? NormalizeWhitespace(parts[1]) : ""
                                  };
                              })
                              .OrderBy(s => s.Name, StringComparer.Ordinal)
                              .Select(s => $"{s.Name}: {s.Value}")
                              .ToList(); // materialize once; the result is read twice below

            return string.Join("; ", styles) + (styles.Count > 0 ? ";" : "");
        }
    }
    
    Login or Signup to reply.
  2. My approach does not use HtmlAgilityPack; it is plain C# built on System.Xml.Linq. Because the input is parsed as XML, it only works for simple cases where the HTML is well-formed (every tag closed, a single root element):

        using System;
        using System.Linq;
        using System.Xml.Linq;
    
        /// <summary>
        /// Compares two well-formed HTML fragments for equivalence while ignoring
        /// attribute order and the order/spacing of declarations inside <c>style</c>.
        /// </summary>
        public class HtmlComparer
        {
            /// <summary>
            /// Parses both fragments as XML, normalizes them and compares the results.
            /// </summary>
            /// <param name="html1">First fragment; must be well-formed XML with a single root element.</param>
            /// <param name="html2">Second fragment; same requirements as <paramref name="html1"/>.</param>
            /// <returns><c>true</c> when the normalized trees are deeply equal.</returns>
            /// <exception cref="System.Xml.XmlException">Either input is not well-formed XML.</exception>
            public bool AreEquivalent(string html1, string html2)
            {
                var xElement1 = XElement.Parse(html1);
                var xElement2 = XElement.Parse(html2);
                return XNode.DeepEquals(NormalizeElement(xElement1), NormalizeElement(xElement2));
            }

            /// <summary>
            /// Builds a copy of <paramref name="element"/> with attributes sorted by name,
            /// attribute values normalized, and child elements and text kept in document order.
            /// </summary>
            private XElement NormalizeElement(XElement element)
            {
                // XName does not implement IComparable, so sort by its string form;
                // sorting by the XName itself throws once an element has two attributes.
                var attributes = element.Attributes()
                    .OrderBy(a => a.Name.ToString(), StringComparer.Ordinal)
                    .Select(a => new XAttribute(a.Name, NormalizeAttributeValue(a.Name, a.Value)));

                // Keep text and child elements in their original order: both the text
                // and the element order affect what is rendered.
                var content = element.Nodes()
                    .Select(NormalizeNode)
                    .Where(n => n != null);

                return new XElement(element.Name, attributes, content);
            }

            /// <summary>
            /// Maps a child node to its normalized content: a normalized element, the
            /// text of a text node, or <c>null</c> for any other node kind (e.g. comments),
            /// which the caller drops.
            /// </summary>
            private object NormalizeNode(XNode node)
            {
                var child = node as XElement;
                if (child != null)
                {
                    return NormalizeElement(child);
                }

                var text = node as XText;
                if (text != null)
                {
                    return text.Value;
                }

                return null;
            }

            /// <summary>
            /// Returns <paramref name="value"/> unchanged unless the attribute is <c>style</c>;
            /// for <c>style</c>, returns the declarations normalized to <c>name: value</c>,
            /// sorted ordinally and joined with <c>"; "</c>.
            /// </summary>
            private string NormalizeAttributeValue(XName attributeName, string value)
            {
                if (attributeName != "style")
                {
                    return value;
                }

                var declarations = value.Split(';')
                    .Select(NormalizeDeclaration)
                    .Where(d => d.Length > 0) // drops empty entries such as the one after a trailing "; "
                    .OrderBy(d => d, StringComparer.Ordinal);
                return string.Join("; ", declarations);
            }

            /// <summary>
            /// Trims a single style declaration and normalizes the spacing around its
            /// first colon, so that <c>color:red</c> and <c>color : red</c> both become <c>color: red</c>.
            /// </summary>
            private static string NormalizeDeclaration(string declaration)
            {
                var trimmed = declaration.Trim();
                var colon = trimmed.IndexOf(':');
                if (colon < 0)
                {
                    return trimmed;
                }

                return trimmed.Substring(0, colon).TrimEnd() + ": " + trimmed.Substring(colon + 1).TrimStart();
            }
        }
    
        /// <summary>
        /// Console demo: feeds the two sample fragments from the question to
        /// <c>HtmlComparer.AreEquivalent</c> and prints the boolean result.
        /// </summary>
        public class Program
        {
            public static void Main()
            {
                // Same markup; only the order of the style declarations differs.
                const string first = @"<p><strong style=""font-size: 36px; color: rgb(153, 51, 255);"">Hello </strong><em><u>World</u></em></p>";
                const string second = @"<p><strong style=""color: rgb(153, 51, 255); font-size: 36px;"">Hello </strong><em><u>World</u></em></p>";

                bool equivalent = new HtmlComparer().AreEquivalent(first, second);
                Console.WriteLine(equivalent);
            }
        }
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search