Here is a nice article from Oxylabs: "Web Scraping With C#".
You can try this code in VS Code:
using System.Globalization;
using CsvHelper;
using HtmlAgilityPack;
namespace webscraping
{
/// <summary>
/// One scraped book: the raw title and price text taken from a product page.
/// Instances are handed to <c>CsvWriter.WriteRecords</c> by <c>Program</c>,
/// so both members are exposed as public read/write properties.
/// </summary>
public class Book
{
    /// <summary>Text of the product page heading; <c>null</c> until assigned.</summary>
    public string? Title
    {
        get;
        set;
    }

    /// <summary>Price text exactly as it appears on the page (not parsed to a number); <c>null</c> until assigned.</summary>
    public string? Price
    {
        get;
        set;
    }
}
/// <summary>
/// Console scraper: collects the product links from one category listing page,
/// visits each product page to read its title and price, and writes the
/// results to <c>books.csv</c> in the current directory.
/// </summary>
class Program
{
    /// <summary>Downloads and parses the HTML page at <paramref name="url"/>.</summary>
    /// <param name="url">Absolute URL of the page to load.</param>
    /// <returns>The parsed document.</returns>
    private static HtmlDocument GetDocument(string url)
    {
        var web = new HtmlWeb();
        return web.Load(url);
    }

    /// <summary>
    /// Extracts the absolute URL of every <c>//h3/a</c> link on the listing page.
    /// </summary>
    /// <param name="url">Absolute URL of the listing page; also the base for resolving relative links.</param>
    /// <returns>Absolute link URLs in document order; empty when the page has no matching links.</returns>
    private static List<string> GetBookLinks(string url)
    {
        var bookLinks = new List<string>();
        HtmlDocument doc = GetDocument(url);

        // HtmlAgilityPack returns null, not an empty collection, when the XPath matches nothing.
        HtmlNodeCollection? linkNodes = doc.DocumentNode.SelectNodes("//h3/a");
        if (linkNodes is null)
        {
            return bookLinks;
        }

        var baseUri = new Uri(url);
        foreach (HtmlNode link in linkNodes)
        {
            // An anchor without href would make Attributes["href"] null; skip it instead of crashing.
            string href = link.GetAttributeValue("href", string.Empty);
            if (href.Length == 0)
            {
                continue;
            }

            // hrefs on the listing page are relative, so resolve them against the page URL.
            bookLinks.Add(new Uri(baseUri, href).AbsoluteUri);
        }

        return bookLinks;
    }

    /// <summary>
    /// Loads every product page in <paramref name="urls"/> and reads its title and price.
    /// </summary>
    /// <param name="urls">Absolute product page URLs.</param>
    /// <returns>
    /// One <see cref="Book"/> per URL, in input order. A field is <c>null</c> when the
    /// corresponding element is missing from the page.
    /// </returns>
    private static List<Book> GetBookDetails(List<string> urls)
    {
        const string titleXPath = "//h1";
        const string priceXPath = "//div[contains(@class,\"product_main\")]/p[@class=\"price_color\"]";

        var books = new List<Book>(urls.Count);
        foreach (string url in urls)
        {
            HtmlDocument document = GetDocument(url);
            books.Add(new Book
            {
                Title = GetNodeText(document, titleXPath),
                Price = GetNodeText(document, priceXPath),
            });
        }

        return books;
    }

    /// <summary>
    /// Returns the decoded text of the first node matching <paramref name="xpath"/>,
    /// or <c>null</c> when no node matches.
    /// </summary>
    private static string? GetNodeText(HtmlDocument document, string xpath)
    {
        // SelectSingleNode returns null when nothing matches.
        HtmlNode? node = document.DocumentNode.SelectSingleNode(xpath);
        if (node is null)
        {
            return null;
        }

        // InnerText keeps HTML entities (e.g. "&amp;") encoded; decode them so the CSV holds plain text.
        return HtmlEntity.DeEntitize(node.InnerText);
    }

    /// <summary>Writes <paramref name="books"/> to <c>books.csv</c>, replacing any existing file.</summary>
    /// <param name="books">Records to write.</param>
    private static void ExportToCsv(List<Book> books)
    {
        using var writer = new StreamWriter("books.csv");
        using var csv = new CsvWriter(writer, CultureInfo.InvariantCulture);
        csv.WriteRecords(books);
    }

    /// <summary>Entry point: scrape the category listing page and export the books to CSV.</summary>
    private static void Main(string[] args)
    {
        List<string> bookLinks = GetBookLinks("http://books.toscrape.com/catalogue/category/books/mystery_3/index.html");
        Console.WriteLine($"Found {bookLinks.Count} links");

        List<Book> books = GetBookDetails(bookLinks);
        ExportToCsv(books);
    }
}
}
For scraping dynamic websites, you can use Selenium, as shown here:
using System.Globalization;
using CsvHelper;
using OpenQA.Selenium;
using OpenQA.Selenium.Firefox;
using WebDriverManager;
using WebDriverManager.DriverConfigs.Impl;
namespace webscraping
{
/// <summary>
/// A single scraped quotation: its text and the name of its author.
/// </summary>
public class Quote
{
    /// <summary>The quotation text; <c>null</c> until assigned.</summary>
    public string? Text { get; set; }

    /// <summary>The author's name; <c>null</c> until assigned.</summary>
    public string? Author { get; set; }

    /// <summary>
    /// Formats the quote as "<c>Author</c> says, <c>Text</c>". A <c>null</c>
    /// property contributes an empty string.
    /// </summary>
    public override string ToString() => $"{Author} says, {Text}";
}
/// <summary>
/// Console scraper for a JavaScript-rendered page: drives headless Firefox
/// through Selenium, reads every quote on the page, prints each one, writes
/// them to a CSV file and saves a full-page screenshot.
/// </summary>
public class Program
{
    // Output locations. The CSV path is unchanged from the original listing.
    private const string QuotesCsvPath = "c:\\temp\\quotes.csv";
    private const string ScreenshotPath = "c:\\temp\\quotes.png";

    /// <summary>Entry point: scrape the quotes page and export the results.</summary>
    static void Main(string[] args)
    {
        new DriverManager().SetUpDriver(new FirefoxConfig());

        var options = new FirefoxOptions();
        options.AddArgument("--headless");

        var driver = new FirefoxDriver(options);
        try
        {
            driver.Navigate().GoToUrl("http://quotes.toscrape.com/js/");

            var quotes = new List<Quote>();
            foreach (IWebElement item in driver.FindElements(By.CssSelector("div.quote")))
            {
                Quote quote = new()
                {
                    Text = item.FindElement(By.CssSelector("span.text")).Text,
                    Author = item.FindElement(By.CssSelector(".author")).Text
                };
                quotes.Add(quote);
                Console.WriteLine(quote.ToString());
            }

            // Output is best-effort: a failure is reported, not fatal. Opening the
            // file sits inside the try as well, so a missing directory is handled
            // the same way as a failed write.
            try
            {
                using (var writer = new StreamWriter(QuotesCsvPath))
                using (var csv = new CsvWriter(writer, CultureInfo.InvariantCulture))
                {
                    // Disposing the writer flushes it, so no explicit Flush is needed.
                    csv.WriteRecords(quotes);
                }

                // Persist the screenshot; previously it was captured and then discarded.
                Screenshot img = driver.GetFullPageScreenshot();
                File.WriteAllBytes(ScreenshotPath, img.AsByteArray);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.ToString());
            }
        }
        finally
        {
            // Always shut the browser down, even when scraping throws; otherwise
            // the headless Firefox and driver processes keep running.
            driver.Quit();
        }
    }
}
}
Top comments (0)