import json
from urllib import parse

import requests
from bs4 import BeautifulSoup

def extractInstructions(instructions):
    """Flatten schema.org recipeInstructions, keeping named
    HowToSection groups as nested objects."""
    returnedInstructions = []
    for inst in instructions:
        # A HowToSection groups its steps under itemListElement
        if "itemListElement" in inst:
            subObj = {}
            subInst = []
            subObj["name"] = inst["name"]
            for el in inst["itemListElement"]:
                subInst.append(el["text"])
            subObj["subInstructions"] = subInst
            returnedInstructions.append(subObj)
        # Otherwise it is a flat HowToStep with the text on the object itself
        else:
            returnedInstructions.append(inst["text"])
    return returnedInstructions
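
# Illustrative sample of the two shapes schema.org allows inside
# recipeInstructions (made-up data, not taken from a real page):
#   HowToStep:    {"@type": "HowToStep", "text": "Boil the pasta."}
#   HowToSection: {"@type": "HowToSection", "name": "Sauce",
#                  "itemListElement": [
#                      {"@type": "HowToStep", "text": "Melt the butter."}]}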

def scrape(url, user_name):
    try:
        data = requests.get(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            },
            timeout=10,
        )
        html = BeautifulSoup(data.text, "html.parser")
        # Yoast SEO embeds the site's schema.org graph as JSON-LD here
        inner_html = html.find("script", class_="yoast-schema-graph")
        if inner_html is None:
            return {"success": False, "error": "no Yoast schema graph on page"}
        json_data = json.loads(inner_html.contents[0])
        graph_data = json_data["@graph"]
        for i in graph_data:
            if i["@type"] == "Recipe":
                recipe = {}
                instructions = extractInstructions(i["recipeInstructions"])
                # Merge cuisine(s) with the comma-separated keywords, then
                # normalise case/whitespace and de-duplicate
                keywords_list = i["keywords"].split(",")
                tags = i["recipeCuisine"] + keywords_list
                cleaned_tags = list(set(tag.strip().lower() for tag in tags))
                slug = parse.quote(i["name"]).lower()
                # The recipe document handed back to the caller
                recipe["user"] = user_name
                recipe["slug"] = slug
                recipe["title"] = i["name"]
                recipe["image"] = i["image"][0]
                recipe["url"] = i["mainEntityOfPage"]
                recipe["tags"] = cleaned_tags
                recipe["ingredients"] = i["recipeIngredient"]
                recipe["instructions"] = instructions
                recipe["visible_by"] = ["jez"]
                return {"success": True, "data": recipe}
        # No Recipe node in the graph: report failure explicitly instead of
        # falling through and implicitly returning None
        return {"success": False, "error": "no Recipe object in schema graph"}
    except Exception as e:
        print(str(e))
        return {"success": False, "error": "couldn't scrape site metadata"}