We can find a lot of C# code that parses the HTTP URLs in a given string. But it is difficult to find code that will: accept a URL as an argument, fetch and parse the site content, extract all URLs from that content, parse the content of each of those URLs, and repeat the process until all URLs have been fetched.
This is the action button's click handler that finds the URLs and adds them to the textbox.
/// <summary>
/// Click handler: crawls the site entered in InputURL (non-recursively)
/// and lists every discovered link in InputURLResult, one per line.
/// </summary>
protected void LnkSearchURL_Click(object sender, EventArgs e)
{
    InputURLResult.Text = string.Empty;

    GetUrlFromWebsite spiderLogic = new GetUrlFromWebsite();
    // Trim() removes all leading/trailing whitespace; the original
    // Trim(' ') missed tabs and newlines pasted into the textbox.
    var list = spiderLogic.GetUrls(InputURL.Text.Trim(), false);

    // Build the output once with StringBuilder instead of repeated
    // string concatenation (O(n^2) allocations for long result lists).
    var builder = new StringBuilder();
    foreach (var item in list)
        builder.Append("URL : ").Append(item).Append(Environment.NewLine);
    InputURLResult.Text = builder.ToString();
}
This is the business logic that extracts the URLs from a website.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
namespace CsharpCode.BusinessPart
public class GetUrlFromWebsite
/// <summary>
/// Returns the urls in specified site address
/// </summary>
/// <param name="url">Base url to start crawling from (fixed: the original doc named this "baseUrl", which does not match the parameter)</param>
/// <param name="recursive">If true, parses recursively through all links</param>
/// <returns>List of discovered url strings</returns>
public IList<string> GetUrls(string url, bool recursive)
{
    // The absolute base decides which discovered links count as
    // "same site"; normalize it to end with a slash.
    string absoluteBaseUrl = url;
    if (!absoluteBaseUrl.EndsWith("/"))
        absoluteBaseUrl += "/";

    return this.GetUrls(url, absoluteBaseUrl, recursive);
}
/// <summary>
/// Returns the urls reachable from <paramref name="url"/>.
/// </summary>
/// <param name="url">Page to fetch</param>
/// <param name="baseUrl">Absolute base used to filter same-site links</param>
/// <param name="recursive">When true, follows every discovered link</param>
public IList<string> GetUrls(string url, string baseUrl, bool recursive)
{
    // Non-recursive: only the links found on this single page.
    if (!recursive)
        return InternalGetUrls(url, baseUrl);

    // Recursive: reset the shared accumulator, then walk the whole site.
    _urls.Clear();
    RecursivelyGenerateUrls(url, baseUrl);
    return _urls;
}
/// <summary>
/// Internal method that recursively generates urls into the shared
/// _urls accumulator, skipping urls already seen (terminates cycles).
/// </summary>
/// <param name="baseUrl">Page whose links are harvested next</param>
/// <param name="absoluteBaseUrl">Base path used to filter same-site links</param>
private void RecursivelyGenerateUrls(string baseUrl, string absoluteBaseUrl)
{
    foreach (string discovered in InternalGetUrls(baseUrl, absoluteBaseUrl))
    {
        // Already visited — do not descend again.
        if (_urls.Contains(discovered))
            continue;

        _urls.Add(discovered);
        // Descend using the directory of the discovered url as the new base.
        RecursivelyGenerateUrls(discovered, GetBasePath(discovered));
    }
}
/// <summary>
/// Returns the "directory" portion of a url: everything up to and
/// including the last '/', ignoring a trailing '/'. When no separator
/// remains, the (trailing-slash-trimmed) input is returned unchanged.
/// </summary>
/// <param name="baseUrl">Url to take the base path of</param>
private string GetBasePath(string baseUrl)
{
    // A trailing slash is not a path separator for this purpose.
    string trimmed = baseUrl.EndsWith("/")
        ? baseUrl.Substring(0, baseUrl.Length - 1)
        : baseUrl;

    int lastSlash = trimmed.LastIndexOf('/');
    if (lastSlash < 0)
        return trimmed;

    // Substring(0, lastSlash + 1) already ends with '/'.
    return trimmed.Substring(0, lastSlash + 1);
}
// Accumulator shared by the recursive walk; cleared at the start of each crawl.
private IList<string> _urls = new List<string>();

/// <summary>
/// Fetches one page and returns the same-site links it contains.
/// Absolute links are kept only when they start with the base;
/// relative links are resolved against the base before being kept.
/// NOTE(review): the pasted original was truncated here — it ended with a
/// dangling "if (!uriString.StartsWith(...))" and no return statement.
/// This reconstruction returns the collected list.
/// </summary>
/// <param name="baseUrl">Page to download</param>
/// <param name="absoluteBaseUrl">Base used to filter/resolve links</param>
private IList<string> InternalGetUrls(string baseUrl, string absoluteBaseUrl)
{
    IList<string> list = new List<string>();

    Uri uri = null;
    if (!Uri.TryCreate(baseUrl, UriKind.RelativeOrAbsolute, out uri))
        return list;

    // Get the http content
    string siteContent = GetHttpResponse(baseUrl);
    var allUrls = GetAllUrls(siteContent);
    foreach (string uriString in allUrls)
    {
        uri = null;
        if (!Uri.TryCreate(uriString, UriKind.RelativeOrAbsolute, out uri))
            continue;

        if (uri.IsAbsoluteUri)
        {
            // If different domain / javascript: urls needed exclude this check
            if (uri.OriginalString.StartsWith(absoluteBaseUrl))
                list.Add(uriString);
        }
        else
        {
            // Relative link: resolve against the base before keeping it.
            string newUri = GetAbsoluteUri(uri, absoluteBaseUrl, uriString);
            if (!string.IsNullOrEmpty(newUri))
                list.Add(newUri);
        }
    }

    return list;
}
/// <summary>
/// Resolves a relative url against the given base path. Returns
/// string.Empty for non-http schemes (mailto:, javascript:, ...) and
/// for strings that cannot be turned into a valid uri.
/// </summary>
/// <param name="uri">Parsed relative uri (only reused as the TryCreate out target)</param>
/// <param name="basePath">Absolute base the relative url is resolved against</param>
/// <param name="uriString">Raw relative url text</param>
private string GetAbsoluteUri(Uri uri, string basePath, string uriString)
{
    if (!string.IsNullOrEmpty(uriString))
    {
        // Any scheme other than http (mailto:, javascript:, ftp:, ...) is skipped.
        if (uriString.Contains(":") && !uriString.Contains("http:"))
            return string.Empty;
    }

    // Walk the base one directory up for every "../" in the link.
    basePath = GetResolvedBasePath(basePath, uriString);
    uriString = uriString.Replace("../", string.Empty);

    string newUriString = basePath;
    if (!newUriString.EndsWith("/"))
        newUriString += "/";
    newUriString += uriString;

    // Collapse accidental double slashes, but never the "://" after the
    // scheme — the original unconditional Replace("//", "/") corrupted
    // "http://" into "http:/".
    int schemeEnd = newUriString.IndexOf("://", StringComparison.Ordinal);
    if (schemeEnd >= 0)
    {
        string head = newUriString.Substring(0, schemeEnd + 3);
        string tail = newUriString.Substring(schemeEnd + 3).Replace("//", "/");
        newUriString = head + tail;
    }
    else
    {
        newUriString = newUriString.Replace("//", "/");
    }

    if (Uri.TryCreate(newUriString, UriKind.RelativeOrAbsolute, out uri))
        return newUriString;

    // The pasted original was missing this fall-through return (compile error).
    return string.Empty;
}
/// <summary>
/// Walks <paramref name="basePath"/> up one directory for every "../"
/// occurrence in <paramref name="uriString"/>.
/// </summary>
/// <param name="basePath">Starting base path</param>
/// <param name="uriString">Relative url whose "../" prefixes are counted</param>
private string GetResolvedBasePath(string basePath, string uriString)
{
    int count = GetCountOf("../", uriString);
    for (int i = 1; i <= count; i++)
        basePath = GetBasePath(basePath);

    // The pasted original was missing this return (compile error without it).
    return basePath;
}
/// <summary>
/// Counts the occurrences of <paramref name="pattern"/> inside
/// <paramref name="str"/>. Overlapping matches are counted, because the
/// scan advances only one character past each hit.
/// </summary>
/// <param name="pattern">Substring to look for</param>
/// <param name="str">String to scan</param>
private int GetCountOf(string pattern, string str)
{
    int count = 0;
    for (int at = str.IndexOf(pattern, 0); at != -1; at = str.IndexOf(pattern, at + 1))
        count++;
    return count;
}
/// <summary>
/// Returns all urls in string content
/// [Includes javascript:, mailto:, other domains too]
/// </summary>
/// <param name="str">Html content to scan for anchor tags</param>
private string[] GetAllUrls(string str)
{
    // Case-insensitive scrape of href="..." values out of <a> tags.
    string pattern = @"<a.*?href=[""'](?<url>.*?)[""'].*?>(?<name>.*?)</a>";
    System.Text.RegularExpressions.MatchCollection matches =
        System.Text.RegularExpressions.Regex.Matches(
            str, pattern, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

    var found = new List<string>(matches.Count);
    foreach (System.Text.RegularExpressions.Match match in matches)
        found.Add(match.Groups["url"].Value);
    return found.ToArray();
}
/// <summary>
/// Returns the response content as string for given url.
/// Returns string.Empty on any failure; the exception is routed to the
/// OnException callback via HandleException.
/// </summary>
/// <param name="url">Url to download with a GET request</param>
private string GetHttpResponse(string url)
{
    try
    {
        HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(url);
        myRequest.Method = "GET";

        // using releases the connection even if reading throws — the
        // original never disposed the response. (Its ASCIIEncoding local
        // was also unused and has been removed.)
        using (HttpWebResponse response = (HttpWebResponse)myRequest.GetResponse())
        {
            return GetResponseContent(response);
        }
    }
    catch (Exception ex)
    {
        HandleException(ex);
    }
    return String.Empty;
}
#region "Exception Handling"

public delegate void OnExceptionDelegate(Exception ex);

/// <summary>
/// OnException delegate can be used to handle the exceptions inside this class
/// </summary>
public OnExceptionDelegate OnException;

/// <summary>Forwards an exception to the registered callback, if any.</summary>
private void HandleException(Exception ex)
{
    // Snapshot the delegate so the null check and the call see the same value.
    OnExceptionDelegate handler = OnException;
    if (handler != null)
        handler(ex);
}

#endregion
/// <summary>
/// Returns the string content of HttpWebResponse, or string.Empty when
/// the response is null.
/// </summary>
/// <param name="response">Response whose body is read to the end</param>
private string GetResponseContent(HttpWebResponse response)
{
    // The pasted original had this null check dangling with no body;
    // returning empty matches GetHttpResponse's failure convention.
    if (response == null)
        return string.Empty;

    // using disposes the reader and stream even when Read throws (the
    // original only Close()d on the success path), and ReadToEnd replaces
    // the one-character-at-a-time loop.
    using (Stream stream = response.GetResponseStream())
    using (StreamReader streamReader = new StreamReader(stream))
    {
        return streamReader.ReadToEnd();
    }
}