using System;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;

/// <summary>
/// Parses HTML and produces XML content describing the tables found in it.
/// Only tables that do not contain nested tables are matched: an outer table that
/// wraps another table is ignored, while the innermost table is still parsed.
/// </summary>
public class HtmlToXml
{
    // Pattern by Steven Levithan (http://blog.stevenlevithan.com/): matches an
    // innermost <tag>...</tag> pair, i.e. one whose body contains no further
    // opening <tag>. {0} is replaced with the tag name. The trailing closing-tag
    // part is required; without it the lazy quantifier matches nothing and each
    // match is reduced to the bare opening tag.
    protected const string RegExBase = @"<{0}\b[^>]*>(?:(?>[^<]+)|<(?!{0}\b[^>]*>))*?</{0}\s*>";

    private const RegexOptions TagRegexOptions =
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled;

    // Built once for the whole type instead of once per instance / per call.
    private static readonly Regex regexTable = BuildTagRegex("table");
    private static readonly Regex regexTr = BuildTagRegex("tr");
    private static readonly Regex regexTd = BuildTagRegex("td");

    // Matches any markup tag (possibly spanning several lines) so it can be stripped.
    private static readonly Regex regexMarkup = new Regex("<(.|\n)+?>", RegexOptions.Compiled);

    /// <summary>
    /// Creates a parser. The instance holds no state of its own.
    /// </summary>
    public HtmlToXml()
    {
    }

    /// <summary>
    /// Builds the innermost-element regex for the given tag name.
    /// </summary>
    private static Regex BuildTagRegex(string tagName)
    {
        return new Regex(String.Format(RegExBase, tagName), TagRegexOptions);
    }

    /// <summary>
    /// Removes all markup tags from a fragment, decodes HTML entities and trims
    /// surrounding white space.
    /// </summary>
    /// <param name="source">HTML fragment, typically one complete cell element.</param>
    /// <returns>The plain, decoded and trimmed text of the fragment.</returns>
    protected string CleanCell(string source)
    {
        string withoutTags = regexMarkup.Replace(source, "");
        return HttpUtility.HtmlDecode(withoutTags).Trim();
    }

    /// <summary>
    /// Converts every td element found in a row into a numbered column element.
    /// </summary>
    /// <param name="input">HTML of a single row.</param>
    /// <returns>
    /// The concatenated column elements (Column1, Column2, ...), or an empty
    /// string when the row contains no td element.
    /// </returns>
    protected string ParseCells(string input)
    {
        StringBuilder result = new StringBuilder();
        int i = 0;

        foreach (Match match in regexTd.Matches(input))
        {
            i++;
            string cellText = CleanCell(match.Value);

            // The text was entity-decoded by CleanCell, so it is re-encoded here
            // to keep characters such as & and < from breaking the markup.
            // NOTE(review): the element names used in this class (Column, Row,
            // Table, DocumentElement) are a reconstruction - the previous format
            // strings carried no element names at all. Confirm against consumers.
            result.AppendFormat("<Column{0}>{1}</Column{0}>", i, HttpUtility.HtmlEncode(cellText));
        }

        return result.ToString();
    }

    /// <summary>
    /// Converts every tr element found in a table into a Row element,
    /// one per line.
    /// </summary>
    /// <param name="source">HTML of a single table.</param>
    /// <returns>
    /// The Row elements, each followed by a line break, or an empty string when
    /// the table contains no tr element.
    /// </returns>
    protected string ParseRows(string source)
    {
        StringBuilder result = new StringBuilder();

        foreach (Match match in regexTr.Matches(source))
        {
            result.Append("<Row>")
                  .Append(ParseCells(match.Value))
                  .Append("</Row>")
                  .Append(System.Environment.NewLine);
        }

        return result.ToString();
    }

    /// <summary>
    /// Extracts every table without nested tables from the given HTML and
    /// returns it as XML wrapped in a single DocumentElement root.
    /// </summary>
    /// <param name="source">HTML text to parse.</param>
    /// <returns>
    /// The XML text. Tables are numbered in the order they are matched; a table
    /// that yields no rows is omitted but still consumes its number.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
    public string ParseHtml(string source)
    {
        if (source == null)
        {
            throw new ArgumentNullException("source");
        }

        StringBuilder result = new StringBuilder();
        int i = 0;

        foreach (Match match in regexTable.Matches(source))
        {
            i++;
            string rows = ParseRows(match.Value);
            if (rows.Length > 0)
            {
                result.AppendFormat("<Table{0}>{1}{2}</Table{0}>{1}", i, System.Environment.NewLine, rows);
            }
        }

        return "<DocumentElement>" + result + "</DocumentElement>";
    }
}