aboutsummaryrefslogtreecommitdiff
path: root/scripts/scraping/scraper.py
blob: 50b46ae7cbf4bc90b9c8597be45d8189f9c5f26a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import requests
from urllib import parse
import json
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with the page request.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)


def _as_list(value):
    """Normalize a property that may be absent, a single value, or a list."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _is_recipe(node):
    """Return True if the graph node's ``@type`` is, or contains, "Recipe"."""
    if not isinstance(node, dict):
        return False
    return "Recipe" in _as_list(node.get("@type"))


def _instruction_texts(steps):
    """Flatten ``recipeInstructions`` into a flat list of step texts.

    Accepts plain strings, step objects carrying a ``"text"`` key, and
    grouping objects that nest further steps under ``"itemListElement"``.
    """
    texts = []
    for step in _as_list(steps):
        if isinstance(step, str):
            texts.append(step)
        elif "text" not in step and "itemListElement" in step:
            texts.extend(_instruction_texts(step["itemListElement"]))
        else:
            # A step with neither key is malformed; let the KeyError surface
            # rather than silently dropping an instruction.
            texts.append(step["text"])
    return texts


def _clean_tags(node):
    """Build the lower-cased, de-duplicated tag list for a recipe node.

    Tags are the ``recipeCuisine`` values followed by the comma-separated
    ``keywords``. First-seen order is kept so the result is deterministic,
    and empty entries (e.g. from a trailing comma) are dropped.
    """
    keywords = node.get("keywords", "")
    if isinstance(keywords, str):
        keyword_list = keywords.split(",")
    else:
        keyword_list = _as_list(keywords)
    raw_tags = _as_list(node.get("recipeCuisine")) + keyword_list
    cleaned = (tag.strip().lower() for tag in raw_tags)
    return list(dict.fromkeys(tag for tag in cleaned if tag))


def _build_recipe(node, user_name):
    """Convert one "Recipe" graph node into the recipe dict returned by scrape."""
    image = node["image"]
    if isinstance(image, (list, tuple)):
        # Original behaviour: the first entry of the image list is used.
        image = image[0]

    return {
        "user": user_name,
        "slug": parse.quote(node["name"]).lower(),
        "title": node["name"],
        "image": image,
        "url": node["mainEntityOfPage"],
        "tags": _clean_tags(node),
        "ingredients": node["recipeIngredient"],
        "instructions": _instruction_texts(node["recipeInstructions"]),
        # NOTE(review): visibility is hard-coded to a single user; confirm
        # whether this should be derived from ``user_name`` instead.
        "visible_by": ["jez"],
        # TODO: add an "encoded_url" field (URL-quoted recipe name).
    }


def scrape(url, user_name, *, timeout=30):
    """Fetch a recipe page and extract its recipe data.

    The page is expected to embed a ``<script class="yoast-schema-graph">``
    element whose content is JSON with an ``"@graph"`` list; the node typed
    "Recipe" in that list is converted to a plain dict.

    Args:
        url: Address of the recipe page to download.
        user_name: Value stored under the ``"user"`` key of the result.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with the keys ``user``, ``slug``, ``title``, ``image``,
        ``url``, ``tags``, ``ingredients``, ``instructions`` and
        ``visible_by``. If the graph holds several Recipe nodes, the last
        one is returned.

    Raises:
        ValueError: If the page has no schema script block or the graph
            contains no Recipe node.
        KeyError: If the Recipe node lacks a required field.
    """
    response = requests.get(
        url,
        headers={"User-Agent": _USER_AGENT},
        timeout=timeout,  # without a timeout a stalled server hangs forever
    )
    html = BeautifulSoup(response.text, 'html.parser')
    schema_tag = html.find('script', class_='yoast-schema-graph')
    if schema_tag is None or not schema_tag.contents:
        raise ValueError(f"No 'yoast-schema-graph' script block found at {url}")

    json_data = json.loads(schema_tag.contents[0])
    graph_data = json_data.get("@graph", []) if isinstance(json_data, dict) else []

    recipe = None
    for node in graph_data:
        if _is_recipe(node):
            recipe = _build_recipe(node, user_name)

    if recipe is None:
        raise ValueError(f"No Recipe entry found in the schema graph at {url}")
    return recipe