Compare commits

...

2 Commits

Author SHA1 Message Date
Ross Healy
0cd68968d9 Helper methods to clear up parsed text 2024-02-03 22:28:35 +00:00
Ross Healy
fbbb2e9d6f Code to extract drawNumber and drawDate 2024-02-03 22:28:09 +00:00
2 changed files with 81 additions and 0 deletions

View File

@@ -1,5 +1,7 @@
using HtmlAgilityPack;
using lottery_co_uk_scraper.Utilities;
using System.Globalization;
using System.Text.RegularExpressions;
namespace lottery_co_uk_scraper
{
@@ -269,6 +271,47 @@ namespace lottery_co_uk_scraper
}
// Draw number: look up the first <meta> element whose "name" attribute is
// "keywords" and pull the digits that follow "lotto draw " out of its
// "content" attribute.
var metaKeywords = doc.DocumentNode.Descendants("meta")
    .Where(node => node.GetAttributeValue("name", "") == "keywords")
    .FirstOrDefault();
if (metaKeywords is null)
{
    Console.WriteLine("Meta keywords not found.");
}
else
{
    string keywordContent = metaKeywords.GetAttributeValue("content", "");
    Match numberMatch = Regex.Match(keywordContent, @"lotto draw (\d+)");
    if (!numberMatch.Success)
    {
        Console.WriteLine("Draw Number not found.");
    }
    else
    {
        // Group 1 holds only the digits captured by (\d+).
        int drawNumber = int.Parse(numberMatch.Groups[1].Value);
        Console.WriteLine($"Draw Number: {drawNumber}");
    }
}
// Draw date: taken from the text of the first <title> element, which is
// handed to TextRemoval.ParseDateString for parsing.
var title = doc.DocumentNode.Descendants("title")
    .FirstOrDefault();
if (title != null)
{
    var titleText = title.InnerText;
    var date = TextRemoval.ParseDateString(titleText);
    // ParseDateString reports failure by returning DateTime.MinValue; do not
    // print that sentinel as if it were a real draw date ("0001-01-01").
    if (date == DateTime.MinValue)
    {
        Console.WriteLine("Draw Date not found.");
    }
    else
    {
        // Invariant culture keeps the yyyy-MM-dd output on the Gregorian
        // calendar regardless of the machine's current culture.
        string formattedDate = date.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture);
        Console.WriteLine("Draw Date: " + formattedDate);
    }
}
else
{
    Console.WriteLine("Title not found.");
}
var rolloverElement = doc.DocumentNode.Descendants("span")
.FirstOrDefault(x => x.InnerText.Trim() == "Rollover");

View File

@@ -0,0 +1,38 @@
using System.Globalization;
using System.Text.RegularExpressions;
namespace lottery_co_uk_scraper.Utilities
{
/// <summary>
/// Helper methods for cleaning up scraped text before it is parsed.
/// </summary>
public class TextRemoval
{
    // Leading text that ParseDateString strips before parsing the date.
    private const string PrefixToIgnore = "Lotto Results ";

    // Layout expected after the prefix and ordinal suffix are removed,
    // e.g. "Saturday 3 February 2024".
    private const string DateFormat = "dddd d MMMM yyyy";

    // Matches "st", "nd", "rd" or "th" only when directly preceded by a digit
    // and followed by a word boundary. Built once instead of on every call.
    private static readonly Regex OrdinalSuffixPattern =
        new Regex("(?<=\\d)(st|nd|rd|th)\\b", RegexOptions.Compiled);

    /// <summary>
    /// Removes English ordinal suffixes that directly follow a digit
    /// (for example "3rd" becomes "3").
    /// </summary>
    /// <param name="input">Text to clean.</param>
    /// <returns>The input with every matching suffix removed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string RemoveOrdinalSuffix(string input)
    {
        return OrdinalSuffixPattern.Replace(input, "");
    }

    /// <summary>
    /// Parses a date such as "Lotto Results Saturday 3rd February 2024".
    /// The "Lotto Results " prefix is optional; surrounding whitespace is ignored.
    /// </summary>
    /// <param name="dateString">Text containing the date.</param>
    /// <returns>
    /// The parsed date, or <see cref="DateTime.MinValue"/> when the text is
    /// null, blank, or does not match the expected layout.
    /// </returns>
    public static DateTime ParseDateString(string dateString)
    {
        // Null or blank input cannot hold a date; report failure the same way
        // as any other unparseable text instead of throwing.
        if (string.IsNullOrWhiteSpace(dateString))
        {
            return DateTime.MinValue;
        }

        // Scraped text often carries stray leading/trailing whitespace, which
        // would otherwise defeat both the prefix check and the exact parse.
        string candidate = dateString.Trim();

        // Ordinal comparison: the prefix is a fixed literal, not linguistic text.
        if (candidate.StartsWith(PrefixToIgnore, StringComparison.Ordinal))
        {
            candidate = candidate.Substring(PrefixToIgnore.Length);
        }

        // "3rd" -> "3" so the day matches the "d" format specifier.
        string cleanedDateString = RemoveOrdinalSuffix(candidate);

        return DateTime.TryParseExact(
            cleanedDateString,
            DateFormat,
            CultureInfo.InvariantCulture,
            DateTimeStyles.AllowWhiteSpaces,
            out var parsedDate)
            ? parsedDate
            : DateTime.MinValue;
    }
}
}