// Download the page. The URI must include a scheme ("http://"); without one,
// WebRequest.Create cannot parse it and throws a UriFormatException.
String page = HttpGet("http://www.myXhtmlPage.biz");

// Convert the HTML text into an XmlDocument, disposing the reader once loaded.
XmlDocument doc;
using (StringReader sReader = new StringReader(page))
{
    doc = FromHtml(sReader);
}

// XPath has no notion of a default namespace, so the XHTML namespace has to be
// bound to a prefix (any prefix works; "games" is used here) for the query.
XmlNamespaceManager nsmgr = new XmlNamespaceManager(doc.NameTable);
nsmgr.AddNamespace("games", "http://www.w3.org/1999/xhtml");

// SelectSingleNode returns null when nothing matches, so a page without a
// <title> yields an empty string instead of a NullReferenceException.
XmlNode titleNode = doc.SelectSingleNode("//games:html/games:head/games:title", nsmgr);
String title = titleNode != null ? titleNode.InnerText : String.Empty;
To convert HTML to XHTML, I recommend the excellent SgmlReader. The following code uses SgmlReader to convert HTML to XHTML (taken from the SgmlReader main page here).
/// <summary>
/// Reads HTML markup through an SgmlReader and loads the result into an
/// <see cref="XmlDocument"/>.
/// </summary>
/// <param name="reader">Text reader supplying the HTML markup.</param>
/// <returns>The document populated from the SgmlReader.</returns>
XmlDocument FromHtml(TextReader reader) {
    // Parser configuration: HTML doctype, all whitespace reported, and
    // lower-case folding (presumably applied to element/attribute names).
    Sgml.SgmlReader htmlParser = new Sgml.SgmlReader {
        DocType = "HTML",
        WhitespaceHandling = WhitespaceHandling.All,
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = reader
    };

    // Target document: whitespace is preserved and no resolver is set, so
    // external resources are not resolved while loading.
    XmlDocument result = new XmlDocument {
        PreserveWhitespace = true,
        XmlResolver = null
    };

    result.Load(htmlParser);
    return result;
}
This is the code to download a web page as a string, otherwise known as an HTTP GET.
/// <summary>
/// Performs a request against the given URI and returns the response body as text.
/// </summary>
/// <param name="URI">Absolute URI of the resource, including its scheme (e.g. "http://").</param>
/// <returns>The response body with leading and trailing whitespace trimmed.</returns>
/// <exception cref="System.UriFormatException">The URI is not a valid absolute URI.</exception>
/// <exception cref="System.Net.WebException">The request failed.</exception>
public static string HttpGet(string URI)
{
    System.Net.WebRequest req = System.Net.WebRequest.Create(URI);
    //req.Proxy = new System.Net.WebProxy("myproxy", true); // optional proxy; true = bypass the proxy for local addresses

    // Dispose the response and the reader so the underlying connection is
    // released even if reading throws; otherwise it stays open until finalization.
    using (System.Net.WebResponse resp = req.GetResponse())
    using (System.IO.StreamReader sr = new System.IO.StreamReader(resp.GetResponseStream()))
    {
        return sr.ReadToEnd().Trim();
    }
}
No comments:
Post a Comment