Using WebPageGet



Click here for a larger image.

Environment: C#, .NET, Visual Studio.NET

This program was inspired by httptest and another article I did, DNSWatch. In that article, I was looking for DNS names being looked up by browsers. In this program, I retrieve a Web page, display it, and list the hosts referenced in it. There are a lot of diferences in Web pages, especially in how to terminate a line and delineate embedded URLs. There are four main routines. GetHostAndPath is an interesting one. It uses the Uri class (uniform resource identifier) to break out the host and path. GetWebPage is the actual network code, copied from an example and slightly modified. There are several ways to do this send and receive; I just chose this one. GetHosts searches the retrieved page for hosts referenced in it. ChangeLfToCrLf is needed because of how different Web pages are terminated in various ways.

//---------------------------------------------------------------
// This is the main routine for retrieving the contents of a
// Web page and displaying both the page itself plus all
// hostnames embedded in it.
private bool GoGetIt()
{
String host="", path="",  page = "";

  MessageBeep(16);
  String Tstring = m_urls.Text.ToLower();
  if (Tstring.StartsWith("http://") != true)
  m_urls.Text = "http://" + m_urls.Text;
  GetHostAndPath(m_urls.Text, ref host, ref path);
  bool ReturnValue = GetWebPage(host, path,ref page,80);
  m_results.Text = page;
  MessageBeep(32);
  GetHosts(page);
  return ReturnValue;
}
//---------------------------------------------------------------
// This is a lower-level function that uses the TcpClient and
// NetworkStream classes to retrieve a Web page from a remote
// host.
bool GetWebPage(String Host, String Path, ref String Page,
                int Port)
{
NetworkStream ns;
byte[] bytes;
ASCIIEncoding ASCII = new ASCIIEncoding();

  TcpClient tc = new TcpClient();
  try
  {
  tc.Connect(Host, Port);
  ns = tc.GetStream();
  if(ns.CanWrite && ns.CanRead) {
    tc.ReceiveBufferSize = 128000;

    // Create http GET string. Modify for other options
    // as necessary.
    Byte[] sendBytes = Encoding.ASCII.GetBytes
                       (m_http_header.Text);
    ns.Write(sendBytes, 0, sendBytes.Length);

    // Reads the NetworkStream into a byte buffer.
    bytes = new byte[tc.ReceiveBufferSize];
    int numBytesRead = 0;
    int numBytesToRead =  tc.ReceiveBufferSize;

    while (true){
      int n = ns.Read(bytes, numBytesRead, numBytesToRead );
      if (n==0) break;
      numBytesRead += n;
      numBytesToRead -= n;
      Thread.Sleep (1000);    //wait a little, just in case
      if( ns.DataAvailable != true)break;
    }
    tc.Close();
    Page = ASCII.GetString(bytes);
    ChangeLfToCrLf(ref Page);
    }
  }
  catch (Exception except )
  {
  Page = except.ToString();
  tc.Close();
  return false;
  }
  return true;
}
//---------------------------------------------------------------
// This function is needed because some Web pages use <cr>
// and <lf>, some just <lf>.
// Some, such as CNN, has both types in pages as well as
// two sets in a row
void ChangeLfToCrLf ( ref String buf)
{
byte[] abuf = new byte[256000];
int abuf_index = 0;
ASCIIEncoding ASCII = new ASCIIEncoding();

  int crCount = buf.IndexOf("\r",0);
  if (crCount == -1) {    // No carriage returns
    buf = buf.Replace("\x0a", "\x0d\x0a");
    return;
  }
  int bufsize = buf.Length;
  for(int i=0; i<bufsize-1; i++) {
  char ch     = buf[i];
  char nextch = buf[i+1];
  if (ch == '\n') {
    if (nextch != '\r') {
      abuf[abuf_index++] = (byte) '\r';
      abuf[abuf_index++] = (byte) '\n';
    }
    else  abuf[abuf_index++] = (byte) '\n';
  }
  else abuf[abuf_index++] = (byte)  ch;
  }
  buf = ASCII.GetString((byte[]) abuf);
}
//---------------------------------------------------------------
//this procedure uses the URI class (uniform resource identifier)
//to break out the hostname from the rest of the URL
void GetHostAndPath( String uri, ref String Hostname,
                     ref String Path)
{
try {
  Uri siteUri = new Uri(uri );
  Hostname    = siteUri.Host;
  Path        = siteUri.AbsolutePath;
}
catch{}
}
//---------------------------------------------------------------
//this procedure searches the page for "http://" to look for
//host names
public void GetHosts (String page)
{
  try
  {
  String Tstring = page.ToLower();
  m_url_list.Items.Clear();
  int sDx = 0;
  char[] chTerminators = new char[6];
  chTerminators[0] = '/';
  chTerminators[1] = '\'';
  chTerminators[2] = '"';
  chTerminators[3] = ' ';
  chTerminators[4] = '\t';
  chTerminators[5] = '\n';

  while(true) {
    sDx = Tstring.IndexOf("http://",sDx);
    if (sDx == -1) break;
    int nDx   = Tstring.IndexOfAny(chTerminators,sDx+8);
    String ts = Tstring.Substring(sDx, nDx-sDx);
    if (m_url_list.FindString(ts)== ListBox.NoMatches)
    m_url_list.Items.Add(ts);
    sDx = nDx;
  }
}
catch {}    //ignore
}
//---------------------------------------------------------------

Downloads

Download demo project - 29 Kb


Comments

  • There are no comments yet. Be the first to comment!

Leave a Comment
  • Your email address will not be published. All fields are required.

Top White Papers and Webcasts

  • Live Event Date: May 6, 2014 @ 1:00 p.m. ET / 10:00 a.m. PT While you likely have very good reasons for remaining on WinXP after end of support -- an estimated 20-30% of worldwide devices still are -- the bottom line is your security risk is now significant. In the absence of security patches, attackers will certainly turn their attention to this new opportunity. Join Lumension Vice President Paul Zimski in this one-hour webcast to discuss risk and, more importantly, 5 pragmatic risk mitigation techniques …

  • Download the Information Governance Survey Benchmark Report to gain insights that can help you further establish business value in your Records and Information Management (RIM) program and across your entire organization. Discover how your peers in the industry are dealing with this evolving information lifecycle management environment and uncover key insights such as: 87% of organizations surveyed have a RIM program in place 8% measure compliance 64% cannot get employees to "let go" of information for …

Most Popular Programming Stories

More for Developers

Latest Developer Headlines

RSS Feeds