using System;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
/// <summary>
/// Parses HTML data to produce XML content. The result can be exported to a DataSet.
/// Only HTML tables are parsed; a table that contains a nested table is ignored
/// (the innermost table is still matched on its own).
/// </summary>
public class HtmlToXml
{
    // The pattern comes from Steven Levithan (http://blog.stevenlevithan.com/).
    // {0} is the tag name. It matches an opening tag, then lazily consumes either
    // plain text (atomic group, so no catastrophic backtracking) or a '<' that does
    // NOT start another opening tag of the same name, up to the closing tag.
    // As a consequence an element that contains a nested element of the same name
    // is never matched as a whole.
    protected const string RegExBase = @"<{0}\b[^>]*>(?:(?>[^<]+)|<(?!{0}\b[^>]*>))*?</{0}>";

    // NOTE(review): the XML element names below were reconstructed; confirm them
    // against whatever consumes the generated XML (e.g. the DataSet schema).
    private const string RootElement = "Tables";
    private const string TableElementPrefix = "Table";
    private const string RowElement = "Row";
    private const string ColumnElementPrefix = "Column";

    private const RegexOptions ElementOptions =
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled;

    // Regex instances are immutable and safe to share, so they are built once for
    // the whole process instead of being recompiled for every instance or call.
    private static readonly Regex TableRegex = CreateElementRegex("table");
    private static readonly Regex TrRegex = CreateElementRegex("tr");
    private static readonly Regex TdRegex = CreateElementRegex("td");

    // Matches any markup tag; '(.|\n)' lets a tag span several lines.
    private static readonly Regex MarkupRegex = new Regex("<(.|\n)+?>", RegexOptions.Compiled);

    /// <summary>
    /// Creates a parser. All regular expressions are shared static instances, so
    /// construction is cheap.
    /// </summary>
    public HtmlToXml()
    {
    }

    /// <summary>
    /// Builds the element-matching regex for the given tag name from <see cref="RegExBase"/>.
    /// </summary>
    private static Regex CreateElementRegex(string tagName)
    {
        return new Regex(String.Format(RegExBase, tagName), ElementOptions);
    }

    /// <summary>
    /// Removes every markup tag from a cell, decodes HTML entities and trims the result.
    /// </summary>
    /// <param name="source">Raw HTML of one cell, including its td tags.</param>
    /// <returns>The plain text content of the cell.</returns>
    protected string CleanCell(string source)
    {
        string withoutTags = MarkupRegex.Replace(source, "");
        return HttpUtility.HtmlDecode(withoutTags).Trim();
    }

    /// <summary>
    /// Converts every td element found in a row into a numbered column element.
    /// </summary>
    /// <param name="input">Raw HTML of one table row.</param>
    /// <returns>
    /// The concatenated column elements (Column1, Column2, ...), or an empty string
    /// when the row contains no td element.
    /// </returns>
    protected string ParseCells(string input)
    {
        StringBuilder xml = new StringBuilder();
        int columnIndex = 0;

        foreach (Match cell in TdRegex.Matches(input))
        {
            columnIndex++;
            string elementName = ColumnElementPrefix + columnIndex;

            // The text is decoded by CleanCell and re-encoded here so that
            // '<', '>' and '&' cannot break the generated XML.
            string text = HttpUtility.HtmlEncode(CleanCell(cell.Value));

            xml.Append('<').Append(elementName).Append('>')
               .Append(text)
               .Append("</").Append(elementName).Append('>');
        }

        return xml.ToString();
    }

    /// <summary>
    /// Converts every tr element found in a table into a row element, one per line.
    /// </summary>
    /// <param name="source">Raw HTML of one table.</param>
    /// <returns>
    /// The row elements, each followed by a line break, or an empty string when the
    /// table contains no tr element.
    /// </returns>
    protected string ParseRows(string source)
    {
        StringBuilder xml = new StringBuilder();

        foreach (Match row in TrRegex.Matches(source))
        {
            xml.Append('<').Append(RowElement).Append('>')
               .Append(ParseCells(row.Value))
               .Append("</").Append(RowElement).Append('>')
               .Append(System.Environment.NewLine);
        }

        return xml.ToString();
    }

    /// <summary>
    /// Extracts every HTML table from <paramref name="source"/> and renders it as XML.
    /// </summary>
    /// <param name="source">The HTML text to parse.</param>
    /// <returns>
    /// A root element containing one numbered table element (Table1, Table2, ...)
    /// per matched table that has at least one row. Tables without rows are skipped
    /// but still consume a number.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
    public string ParseHtml(string source)
    {
        if (source == null)
        {
            throw new ArgumentNullException("source");
        }

        StringBuilder xml = new StringBuilder();
        xml.Append('<').Append(RootElement).Append('>');

        int tableIndex = 0;
        foreach (Match table in TableRegex.Matches(source))
        {
            // Incremented before the emptiness check: the number reflects the
            // table's position among the matched tables, not among the emitted ones.
            tableIndex++;

            string rows = ParseRows(table.Value);
            if (rows.Length == 0)
            {
                continue;
            }

            string elementName = TableElementPrefix + tableIndex;
            xml.Append('<').Append(elementName).Append('>')
               .Append(System.Environment.NewLine)
               .Append(rows)
               .Append("</").Append(elementName).Append('>')
               .Append(System.Environment.NewLine);
        }

        xml.Append("</").Append(RootElement).Append('>');
        return xml.ToString();
    }
}