import json

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from w3lib.url import url_query_cleaner


def crawl_mapping():
    """Crawl videospiele.fandom.com and return a link graph mapping
    each visited URL to the set of wiki URLs it links to."""
    mapping = {}

    class MySpider(scrapy.Spider):
        name = "Crawly"
        allowed_domains = ["videospiele.fandom.com"]
        start_urls = ["https://videospiele.fandom.com/wiki/Videospiele_Wiki"]
        visits = 0
        # Follow only article links inside the main content area; skip
        # discussion, category, user, user-blog and special pages.
        link_extractor = LinkExtractor(
            allow=r"videospiele\.fandom\.com",
            deny=[
                r"\.com/.+/wiki",
                r"Diskussion:",
                r"Kategorie:",
                r"Benutzer:",
                r"Benutzer_Blog",
                r"Spezial:",
            ],
            restrict_css="main.page__main",
        )

        def parse(self, response):
            self.visits += 1
            print(f"{self.visits} Visiting", response.url)
            from_url = response.url
            if from_url not in mapping:
                mapping[from_url] = set()
            for link in self.link_extractor.extract_links(response):
                # Strip query strings so each article is counted once.
                url = url_query_cleaner(link.url)
                mapping[from_url].add(url)
                print(f"{from_url} -> {url}")
                yield scrapy.Request(url, self.parse)

    class BooksSpider(scrapy.Spider):
        name = "Books"
        allowed_domains = ["books.toscrape.com"]
        start_urls = ["http://books.toscrape.com/"]

        def parse(self, response):
            # Only catalogue pages list books; the landing page is skipped.
            if response.url.startswith("http://books.toscrape.com/catalogue"):
                for book in response.css("article.product_pod h3"):
                    yield {"title": book.css("a::text").get()}
            # Follow the "next" pagination link.
            yield from response.follow_all(
                response.css("li.next a::attr(href)"), self.parse
            )

    class QuotesSpider(scrapy.Spider):
        name = "quotes"

        def start_requests(self):
            # An optional `tag` spider argument narrows the crawl to one tag.
            url = "https://quotes.toscrape.com/"
            tag = getattr(self, "tag", None)
            if tag is not None:
                url = url + "tag/" + tag
            yield scrapy.Request(url, self.parse)

        def parse(self, response):
            for quote in response.css("div.quote"):
                yield {
                    "text": quote.css("span.text::text").get(),
                    "author": quote.css("small.author::text").get(),
                }
            next_pages = response.css("li.next a::attr(href)")
            yield from response.follow_all(next_pages, self.parse)

    process = CrawlerProcess(
        settings={
            "FEEDS": {
                "items.json": {"format": "json"},
            },
        }
    )

    # Only the wiki crawler is run here; BooksSpider and QuotesSpider are
    # defined but never registered with the process.
    process.crawl(MySpider)
    process.start()  # the script will block here until the crawling is finished
    return mapping


mapping = crawl_mapping()
# JSON cannot serialize sets, so convert each value to a list first.
mapping_list = {k: list(v) for k, v in mapping.items()}

with open("videospiele.json", "w") as f:
    json.dump(mapping_list, f)
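
# A minimal sketch of how the two unused spiders could be run instead of
# MySpider; the "humor" tag is an illustrative assumption. CrawlerProcess
# forwards keyword arguments from crawl() to the spider, so QuotesSpider's
# getattr(self, "tag", None) would see tag="humor". Note that the Twisted
# reactor cannot be restarted within one process, so these calls must
# replace (not follow) the process.crawl(MySpider) / process.start() pair:
#
#     process.crawl(QuotesSpider, tag="humor")  # quotes tagged "humor" only
#     process.crawl(BooksSpider)                # book titles, paginated
#     process.start()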