forschungstage-2023/crawler/crawler.py
import json

import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from w3lib import url as w3url

def crawl_mapping():
    mapping = {}

    class MySpider(scrapy.Spider):
        name = "Crawly"
        allowed_domains = [
            # "minecraft.fandom.com",
            "videospiele.fandom.com",
        ]
        start_urls = ["https://videospiele.fandom.com/wiki/Videospiele_Wiki"]
        visits = 0
        lexer = LinkExtractor(
            allow=r"videospiele\.fandom\.com",
            deny=[
                r"\.com/.+/wiki",
                r"Diskussion:",
                r"Kategorie:",
                r"Benutzer:",
                r"Benutzer_Blog",
                r"Spezial:",
            ],
            restrict_css="main.page__main",
        )
        # Note: `rules` only takes effect on CrawlSpider subclasses; this plain
        # scrapy.Spider ignores it and follows links manually in parse() below
        # (see the CrawlSpider sketch after this class).
        rules = (
            Rule(lexer),
        )

        def parse(self, response):
            self.visits += 1
            print(f"{self.visits} Visiting", response.url)
            from_url = response.url
            if from_url not in mapping:
                mapping[from_url] = set()
            for link in self.lexer.extract_links(response):
                # Drop query parameters so every page is recorded under a single URL.
                url = w3url.url_query_cleaner(link.url)
                mapping[from_url].add(url)
                print(f"{from_url} -> {url}")
                yield scrapy.Request(url, self.parse)
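
    # A minimal sketch (not part of the original crawl) of the CrawlSpider
    # variant that would actually honour `rules`: CrawlSpider follows the
    # extracted links itself and hands each response to the callback. The
    # class name and the parse_page callback are illustrative assumptions.
    from scrapy.spiders import CrawlSpider

    class MyCrawlSpider(CrawlSpider):
        name = "CrawlyRules"
        allowed_domains = ["videospiele.fandom.com"]
        start_urls = ["https://videospiele.fandom.com/wiki/Videospiele_Wiki"]
        rules = (
            Rule(MySpider.lexer, callback="parse_page", follow=True),
        )

        def parse_page(self, response):
            yield {"url": response.url}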

    class BooksSpider(scrapy.Spider):
        name = "Books"
        allowed_domains = ["books.toscrape.com"]
        start_urls = ["http://books.toscrape.com/"]

        def parse(self, response):
            # Only emit titles from catalogue pages; the start page is skipped.
            if response.url.startswith("http://books.toscrape.com/catalogue"):
                for book in response.css("article.product_pod h3"):
                    title = book.css("a::text").get()
                    yield {"title": title}
            # Follow the pagination link unconditionally so the crawl proceeds
            # past the start page.
            yield from response.follow_all(response.css("li.next a::attr(href)"), self.parse)

    class QuotesSpider(scrapy.Spider):
        name = "quotes"

        def start_requests(self):
            # An optional tag can be passed as a spider argument (-a tag=...).
            url = "https://quotes.toscrape.com/"
            tag = getattr(self, "tag", None)
            if tag is not None:
                url = url + "tag/" + tag
            yield scrapy.Request(url, self.parse)

        def parse(self, response):
            for quote in response.css("div.quote"):
                yield {
                    "text": quote.css("span.text::text").get(),
                    "author": quote.css("small.author::text").get(),
                }
            next_pages = response.css("li.next a::attr(href)")
            yield from response.follow_all(next_pages, self.parse)
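
    # Note: only MySpider is scheduled below. BooksSpider and QuotesSpider are
    # defined but never started here; additional process.crawl(...) calls
    # before process.start() would run them in the same process.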

    process = CrawlerProcess(
        settings={
            "FEEDS": {
                "items.json": {"format": "json"},
            },
        }
    )
    process.crawl(MySpider)
    process.start()  # the script will block here until the crawling is finished
    return mapping


mapping = crawl_mapping()
# JSON cannot encode sets, so convert each value to a list before dumping.
mapping_list = {k: list(v) for k, v in mapping.items()}
with open("videospiele.json", "w") as f:
    json.dump(mapping_list, f)
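
# A quick round-trip check (a sketch, not part of the original script): reload
# the dump and rebuild the set-valued mapping to confirm the file is readable.
with open("videospiele.json") as f:
    reloaded = {k: set(v) for k, v in json.load(f).items()}
print(f"reloaded mapping for {len(reloaded)} pages")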