diff options
Diffstat (limited to 'scripts/scraping/scraper.py')
-rw-r--r-- | scripts/scraping/scraper.py | 84 |
1 file changed, 55 insertions, 29 deletions
def extractInstructions(instructions):
    """Flatten a schema.org ``recipeInstructions`` array.

    Each entry is either a plain HowToStep (``{"text": ...}``) or a
    HowToSection holding nested steps under ``"itemListElement"``.

    Returns a list whose items are either plain step strings or, for
    sections, dicts of the form ``{"name": ..., "subInstructions": [...]}``.
    """
    returnedInstructions = []
    for inst in instructions:
        if "itemListElement" in inst:
            # HowToSection: keep the section name and collect its sub-steps.
            # (Removed leftover debug print("hitting sub") from here.)
            returnedInstructions.append({
                "name": inst["name"],
                "subInstructions": [el["text"] for el in inst["itemListElement"]],
            })
        else:
            # Plain HowToStep: just the step text.
            returnedInstructions.append(inst["text"])
    return returnedInstructions


def scrape(url, user_name):
    """Scrape recipe metadata from *url* on behalf of *user_name*.

    Fetches the page, parses the Yoast SEO JSON-LD graph embedded in the
    ``yoast-schema-graph`` script tag, and extracts the first node with
    ``@type == "Recipe"``.

    Returns ``{"success": True, "data": recipe_dict}`` on success and
    ``{"success": False, "error": ...}`` on any failure (network error,
    missing/invalid schema graph, or no Recipe node present).
    """
    try:
        data = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
            # Fail fast instead of hanging forever on an unresponsive host.
            timeout=30,
        )
        html = BeautifulSoup(data.text, 'html.parser')
        inner_html = html.find('script', class_='yoast-schema-graph')
        # If the tag is missing, the attribute access below raises and is
        # converted into a failure result by the except clause.
        json_data = json.loads(inner_html.contents[0])
        for i in json_data["@graph"]:
            if i["@type"] != "Recipe":
                continue
            instructions = extractInstructions(i["recipeInstructions"])
            keywords_list = i["keywords"].split(",")
            tags = i["recipeCuisine"] + keywords_list
            # Normalise and de-duplicate tags (order is not significant).
            cleaned_tags = list(set([tag.strip().lower() for tag in tags]))
            slug = parse.quote(i["name"]).lower()

            # The recipe document persisted for this user.
            recipe = {
                "user": user_name,
                "slug": slug,
                "title": i["name"],
                "image": i["image"][0],
                "url": i["mainEntityOfPage"],
                "tags": cleaned_tags,
                "ingredients": i["recipeIngredient"],
                "instructions": instructions,
                "visible_by": ["jez"],
            }
            return {"success": True, "data": recipe}

        # Bug fix: previously fell off the loop and implicitly returned None
        # when the graph contained no Recipe node; return an explicit
        # failure result so callers always get the documented dict shape.
        return {"success": False, "error": "no Recipe found in site metadata"}

    except Exception as e:
        # Boundary catch: any network/parse failure becomes a failure result
        # rather than propagating to the caller.
        print(str(e))
        return {"success": False, "error": "couldn't scrape site metadata"}