Compare commits

...

2 Commits

Author SHA1 Message Date
Dominic Zimmer
43c2a1a326 Update text-gen 2023-06-14 11:00:25 +02:00
Dominic Zimmer
642c24108e Implement crawler 2023-06-14 11:00:06 +02:00
7 changed files with 13492 additions and 1 deletions

101
crawler/crawler.py Normal file
View File

@ -0,0 +1,101 @@
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from w3lib import url as w3url
def crawl_mapping():
    """Crawl the Videospiele fandom wiki and return its link graph.

    Starts a Scrapy ``CrawlerProcess`` running ``MySpider`` and blocks until
    the crawl has finished.

    Returns:
        dict[str, set[str]]: for every parsed response, its URL mapped to the
        set of link URLs extracted from that page (each passed through
        ``w3lib.url.url_query_cleaner``).
    """
    # Shared with MySpider.parse through the closure; filled while crawling.
    mapping = {}

    class MySpider(scrapy.Spider):
        """Follow article links inside the wiki and record page -> link edges."""

        name = "Crawly"
        allowed_domains = ["videospiele.fandom.com"]
        start_urls = ["https://videospiele.fandom.com/wiki/Videospiele_Wiki"]
        visits = 0
        # NOTE: the original also declared ``rules = (Rule(lexer),)``. Only
        # ``CrawlSpider`` reads ``rules``; on a plain ``scrapy.Spider`` it has
        # no effect, so link following is done explicitly in ``parse``.
        link_extractor = LinkExtractor(
            allow=r"videospiele\.fandom\.com",
            deny=[
                r"\.com/.+/wiki",
                r"Diskussion:",
                r"Kategorie:",
                r"Benutzer:",
                r"Benutzer_Blog",
                r"Spezial:",
            ],
            restrict_css="main.page__main",
        )

        def parse(self, response):
            """Record the links found on ``response`` and request each of them."""
            self.visits += 1
            print(f"{self.visits} Visiting", response.url)
            from_url = response.url
            targets = mapping.setdefault(from_url, set())
            for link in self.link_extractor.extract_links(response):
                url = w3url.url_query_cleaner(link.url)
                targets.add(url)
                print(f"{from_url} -> {url}")
                yield scrapy.Request(url, self.parse)

    class BooksSpider(scrapy.Spider):
        """Yield book titles from books.toscrape.com.

        Not scheduled by ``crawl_mapping``; only ``MySpider`` is crawled.
        """

        name = "Books"
        allowed_domains = ["books.toscrape.com"]
        start_urls = ["http://books.toscrape.com/"]

        def parse(self, response):
            """Yield one ``{"title": ...}`` item per book and follow the next page."""
            # NOTE(review): the start URL does not begin with this prefix, so
            # books listed on the start page are not yielded - confirm intended.
            if response.url.startswith("http://books.toscrape.com/catalogue"):
                for book in response.css("article.product_pod h3"):
                    title = book.css("a::text").get()
                    yield {"title": title}
            yield from response.follow_all(
                response.css("li.next a::attr(href)"), self.parse
            )

    class QuotesSpider(scrapy.Spider):
        """Yield quotes from quotes.toscrape.com.

        Not scheduled by ``crawl_mapping``; only ``MySpider`` is crawled.
        """

        name = "quotes"

        def start_requests(self):
            """Request the front page, or a tag page if a ``tag`` attribute is set."""
            url = "https://quotes.toscrape.com/"
            tag = getattr(self, "tag", None)
            if tag is not None:
                url = url + "tag/" + tag
            yield scrapy.Request(url, self.parse)

        def parse(self, response):
            """Yield text/author items for each quote and follow the next page."""
            for quote in response.css("div.quote"):
                yield {
                    "text": quote.css("span.text::text").get(),
                    "author": quote.css("small.author::text").get(),
                }
            next_pages = response.css("li.next a::attr(href)")
            yield from response.follow_all(next_pages, self.parse)

    # Item feed export target. MySpider.parse yields only requests, so the
    # link graph is delivered through ``mapping`` rather than through this feed.
    process = CrawlerProcess(
        settings={
            "FEEDS": {
                "items.json": {"format": "json"},
            },
        }
    )
    process.crawl(MySpider)
    process.start()  # the script will block here until the crawling is finished
    return mapping
import json

# Run the crawl and persist the resulting link graph as JSON.
mapping = crawl_mapping()
# JSON has no set type; sorting each target set also makes the output file
# reproducible between runs (plain list(set) order is arbitrary).
mapping_list = {k: sorted(v) for k, v in mapping.items()}
rep = json.dumps(mapping_list)
with open("videospiele.json", "w") as f:
    # write() emits the document in one call; writelines() on a str would
    # iterate it character by character.
    f.write(rep)

982
crawler/items.json Normal file
View File

@ -0,0 +1,982 @@
[
{"title": "In Her Wake"},
{"title": "How Music Works"},
{"title": "Foolproof Preserving: A Guide ..."},
{"title": "Chase Me (Paris Nights ..."},
{"title": "Black Dust"},
{"title": "Birdsong: A Story in ..."},
{"title": "America's Cradle of Quarterbacks: ..."},
{"title": "Aladdin and His Wonderful ..."},
{"title": "Worlds Elsewhere: Journeys Around ..."},
{"title": "Wall and Piece"},
{"title": "The Four Agreements: A ..."},
{"title": "The Five Love Languages: ..."},
{"title": "The Elephant Tree"},
{"title": "The Bear and the ..."},
{"title": "Sophie's World"},
{"title": "Penny Maybe"},
{"title": "Maude (1883-1993):She Grew Up ..."},
{"title": "In a Dark, Dark ..."},
{"title": "Behind Closed Doors"},
{"title": "You can't bury them ..."},
{"title": "Slow States of Collapse: ..."},
{"title": "Reasons to Stay Alive"},
{"title": "Private Paris (Private #10)"},
{"title": "#HigherSelfie: Wake Up Your ..."},
{"title": "Without Borders (Wanderlove #1)"},
{"title": "When We Collided"},
{"title": "We Love You, Charlie ..."},
{"title": "Untitled Collection: Sabbath Poems ..."},
{"title": "Unseen City: The Majesty ..."},
{"title": "Unicorn Tracks"},
{"title": "Unbound: How Eight Technologies ..."},
{"title": "Tsubasa: WoRLD CHRoNiCLE 2 ..."},
{"title": "Throwing Rocks at the ..."},
{"title": "This One Summer"},
{"title": "Thirst"},
{"title": "The Torch Is Passed: ..."},
{"title": "The Secret of Dreadwillow ..."},
{"title": "The Pioneer Woman Cooks: ..."},
{"title": "The Past Never Ends"},
{"title": "The Natural History of ..."},
{"title": "The Nameless City (The ..."},
{"title": "The Murder That Never ..."},
{"title": "The Most Perfect Thing: ..."},
{"title": "The Mindfulness and Acceptance ..."},
{"title": "The Life-Changing Magic of ..."},
{"title": "The Inefficiency Assassin: Time ..."},
{"title": "The Gutsy Girl: Escapades ..."},
{"title": "The Electric Pencil: Drawings ..."},
{"title": "The Death of Humanity: ..."},
{"title": "The Bulletproof Diet: Lose ..."},
{"title": "The Art Forger"},
{"title": "The Age of Genius: ..."},
{"title": "The Activist's Tao Te ..."},
{"title": "Spark Joy: An Illustrated ..."},
{"title": "Soul Reader"},
{"title": "Security"},
{"title": "Saga, Volume 6 (Saga ..."},
{"title": "Saga, Volume 5 (Saga ..."},
{"title": "Reskilling America: Learning to ..."},
{"title": "Rat Queens, Vol. 3: ..."},
{"title": "Princess Jellyfish 2-in-1 Omnibus, ..."},
{"title": "Princess Between Worlds (Wide-Awake ..."},
{"title": "Pop Gun War, Volume ..."},
{"title": "Political Suicide: Missteps, Peccadilloes, ..."},
{"title": "Patience"},
{"title": "Outcast, Vol. 1: A ..."},
{"title": "orange: The Complete Collection ..."},
{"title": "Online Marketing for Busy ..."},
{"title": "On a Midnight Clear"},
{"title": "Obsidian (Lux #1)"},
{"title": "My Paris Kitchen: Recipes ..."},
{"title": "Masks and Shadows"},
{"title": "Mama Tried: Traditional Italian ..."},
{"title": "Lumberjanes, Vol. 2: Friendship ..."},
{"title": "Lumberjanes, Vol. 1: Beware ..."},
{"title": "Lumberjanes Vol. 3: A ..."},
{"title": "Layered: Baking, Building, and ..."},
{"title": "Judo: Seven Steps to ..."},
{"title": "Join"},
{"title": "In the Country We ..."},
{"title": "Immunity: How Elie Metchnikoff ..."},
{"title": "I Hate Fairyland, Vol. ..."},
{"title": "I am a Hero ..."},
{"title": "How to Be Miserable: ..."},
{"title": "Her Backup Boyfriend (The ..."},
{"title": "Giant Days, Vol. 2 ..."},
{"title": "Forever and Forever: The ..."},
{"title": "First and First (Five ..."},
{"title": "Fifty Shades Darker (Fifty ..."},
{"title": "Everydata: The Misinformation Hidden ..."},
{"title": "Don't Be a Jerk: ..."},
{"title": "Danganronpa Volume 1"},
{"title": "Crown of Midnight (Throne ..."},
{"title": "Codename Baboushka, Volume 1: ..."},
{"title": "Camp Midnight"},
{"title": "Call the Nurse: True ..."},
{"title": "Burning"},
{"title": "Bossypants"},
{"title": "Bitch Planet, Vol. 1: ..."},
{"title": "Avatar: The Last Airbender: ..."},
{"title": "Algorithms to Live By: ..."},
{"title": "A World of Flavor: ..."},
{"title": "A Piece of Sky, ..."},
{"title": "A Murder in Time"},
{"title": "A Flight of Arrows ..."},
{"title": "A Fierce and Subtle ..."},
{"title": "A Court of Thorns ..."},
{"title": "(Un)Qualified: How God Uses ..."},
{"title": "You Are What You ..."},
{"title": "William Shakespeare's Star Wars: ..."},
{"title": "Tuesday Nights in 1980"},
{"title": "Tracing Numbers on a ..."},
{"title": "Throne of Glass (Throne ..."},
{"title": "Thomas Jefferson and the ..."},
{"title": "Thirteen Reasons Why"},
{"title": "The White Cat and ..."},
{"title": "The Wedding Dress"},
{"title": "The Vacationers"},
{"title": "The Third Wave: An ..."},
{"title": "The Stranger"},
{"title": "The Shadow Hero (The ..."},
{"title": "The Secret (The Secret ..."},
{"title": "The Regional Office Is ..."},
{"title": "The Psychopath Test: A ..."},
{"title": "The Project"},
{"title": "The Power of Now: ..."},
{"title": "The Omnivore's Dilemma: A ..."},
{"title": "The Nerdy Nummies Cookbook: ..."},
{"title": "The Murder of Roger ..."},
{"title": "The Mistake (Off-Campus #2)"},
{"title": "The Matchmaker's Playbook (Wingmen ..."},
{"title": "The Love and Lemons ..."},
{"title": "The Long Shadow of ..."},
{"title": "The Kite Runner"},
{"title": "The House by the ..."},
{"title": "The Glittering Court (The ..."},
{"title": "The Girl on the ..."},
{"title": "The Genius of Birds"},
{"title": "The Emerald Mystery"},
{"title": "The Cookies & Cups ..."},
{"title": "The Bridge to Consciousness: ..."},
{"title": "The Artist's Way: A ..."},
{"title": "The Art of War"},
{"title": "The Argonauts"},
{"title": "The 10% Entrepreneur: Live ..."},
{"title": "Suddenly in Love (Lake ..."},
{"title": "Something More Than This"},
{"title": "Soft Apocalypse"},
{"title": "So You've Been Publicly ..."},
{"title": "Shoe Dog: A Memoir ..."},
{"title": "Shobu Samurai, Project Aryoku ..."},
{"title": "Secrets and Lace (Fatal ..."},
{"title": "Scarlett Epstein Hates It ..."},
{"title": "Romero and Juliet: A ..."},
{"title": "Redeeming Love"},
{"title": "Poses for Artists Volume ..."},
{"title": "Poems That Make Grown ..."},
{"title": "Nightingale, Sing"},
{"title": "Night Sky with Exit ..."},
{"title": "Mrs. Houdini"},
{"title": "Modern Romance"},
{"title": "Miss Peregrine\u2019s Home for ..."},
{"title": "Louisa: The Extraordinary Life ..."},
{"title": "Little Red"},
{"title": "Library of Souls (Miss ..."},
{"title": "Large Print Heart of ..."},
{"title": "I Had a Nice ..."},
{"title": "Hollow City (Miss Peregrine\u2019s ..."},
{"title": "Grumbles"},
{"title": "Full Moon over Noah\u2019s ..."},
{"title": "Frostbite (Vampire Academy #2)"},
{"title": "Follow You Home"},
{"title": "First Steps for New ..."},
{"title": "Finders Keepers (Bill Hodges ..."},
{"title": "Fables, Vol. 1: Legends ..."},
{"title": "Eureka Trivia 6.0"},
{"title": "Drive: The Surprising Truth ..."},
{"title": "Done Rubbed Out (Reightman ..."},
{"title": "Doing It Over (Most ..."},
{"title": "Deliciously Ella Every Day: ..."},
{"title": "Dark Notes"},
{"title": "Daring Greatly: How the ..."},
{"title": "Close to You"},
{"title": "Chasing Heaven: What Dying ..."},
{"title": "Big Magic: Creative Living ..."},
{"title": "Becoming Wise: An Inquiry ..."},
{"title": "Beauty Restored (Riley Family ..."},
{"title": "Batman: The Long Halloween ..."},
{"title": "Batman: The Dark Knight ..."},
{"title": "Ayumi's Violin"},
{"title": "Anonymous"},
{"title": "Amy Meets the Saints ..."},
{"title": "Amid the Chaos"},
{"title": "Amatus"},
{"title": "Agnostic: A Spirited Manifesto"},
{"title": "Zealot: The Life and ..."},
{"title": "You (You #1)"},
{"title": "Wonder Woman: Earth One, ..."},
{"title": "Wild Swans"},
{"title": "Why the Right Went ..."},
{"title": "Whole Lotta Creativity Going ..."},
{"title": "What's It Like in ..."},
{"title": "We Are Robin, Vol. ..."},
{"title": "Walt Disney's Alice in ..."},
{"title": "V for Vendetta (V ..."},
{"title": "Until Friday Night (The ..."},
{"title": "Unbroken: A World War ..."},
{"title": "Twenty Yawns"},
{"title": "Through the Woods"},
{"title": "This Is Where It ..."},
{"title": "The Year of Magical ..."},
{"title": "The Wright Brothers"},
{"title": "The White Queen (The ..."},
{"title": "The Wedding Pact (The ..."},
{"title": "The Time Keeper"},
{"title": "The Testament of Mary"},
{"title": "The Star-Touched Queen"},
{"title": "The Songs of the ..."},
{"title": "The Song of Achilles"},
{"title": "The Rosie Project (Don ..."},
{"title": "The Power of Habit: ..."},
{"title": "The Marriage of Opposites"},
{"title": "The Lucifer Effect: Understanding ..."},
{"title": "The Long Haul (Diary ..."},
{"title": "The Loney"},
{"title": "The Literature Book (Big ..."},
{"title": "The Last Mile (Amos ..."},
{"title": "The Immortal Life of ..."},
{"title": "The Hidden Oracle (The ..."},
{"title": "The Help Yourself Cookbook ..."},
{"title": "The Guilty (Will Robie ..."},
{"title": "The First Hostage (J.B. ..."},
{"title": "The Dovekeepers"},
{"title": "The Darkest Lie"},
{"title": "The Bane Chronicles (The ..."},
{"title": "The Bad-Ass Librarians of ..."},
{"title": "The 14th Colony (Cotton ..."},
{"title": "That Darkness (Gardiner and ..."},
{"title": "Tastes Like Fear (DI ..."},
{"title": "Take Me with You"},
{"title": "Swell: A Year of ..."},
{"title": "Superman Vol. 1: Before ..."},
{"title": "Still Life with Bread ..."},
{"title": "Steve Jobs"},
{"title": "Sorting the Beef from ..."},
{"title": "Someone Like You (The ..."},
{"title": "So Cute It Hurts!!, ..."},
{"title": "Shtum"},
{"title": "See America: A Celebration ..."},
{"title": "salt."},
{"title": "Robin War"},
{"title": "Red Hood/Arsenal, Vol. 1: ..."},
{"title": "Rain Fish"},
{"title": "Quarter Life Poetry: Poems ..."},
{"title": "Pet Sematary"},
{"title": "Overload: How to Unplug, ..."},
{"title": "Once Was a Time"},
{"title": "Old School (Diary of ..."},
{"title": "No Dream Is Too ..."},
{"title": "Naruto (3-in-1 Edition), Vol. ..."},
{"title": "My Name Is Lucy ..."},
{"title": "My Mrs. Brown"},
{"title": "My Kind of Crazy"},
{"title": "Mr. Mercedes (Bill Hodges ..."},
{"title": "More Than Music (Chasing ..."},
{"title": "Made to Stick: Why ..."},
{"title": "Luis Paints the World"},
{"title": "Luckiest Girl Alive"},
{"title": "Lowriders to the Center ..."},
{"title": "Love Is a Mix ..."},
{"title": "Looking for Lovely: Collecting ..."},
{"title": "Living Leadership by Insight: ..."},
{"title": "Let It Out: A ..."},
{"title": "Lady Midnight (The Dark ..."},
{"title": "It's All Easy: Healthy, ..."},
{"title": "Island of Dragons (Unwanteds ..."},
{"title": "I Know What I'm ..."},
{"title": "I Am Pilgrim (Pilgrim ..."},
{"title": "Hyperbole and a Half: ..."},
{"title": "Hush, Hush (Hush, Hush ..."},
{"title": "Hold Your Breath (Search ..."},
{"title": "Hamilton: The Revolution"},
{"title": "Greek Mythic History"},
{"title": "God: The Most Unpleasant ..."},
{"title": "Glory over Everything: Beyond ..."},
{"title": "Feathers: Displays of Brilliant ..."},
{"title": "Far & Away: Places ..."},
{"title": "Every Last Word"},
{"title": "Eligible (The Austen Project ..."},
{"title": "El Deafo"},
{"title": "Eight Hundred Grapes"},
{"title": "Eaternity: More than 150 ..."},
{"title": "Eat Fat, Get Thin"},
{"title": "Don't Get Caught"},
{"title": "Doctor Sleep (The Shining ..."},
{"title": "Demigods & Magicians: Percy ..."},
{"title": "Dear Mr. Knightley"},
{"title": "Daily Fantasy Sports"},
{"title": "Crazy Love: Overwhelmed by ..."},
{"title": "Cometh the Hour (The ..."},
{"title": "Code Name Verity (Code ..."},
{"title": "Clockwork Angel (The Infernal ..."},
{"title": "City of Glass (The ..."},
{"title": "City of Fallen Angels ..."},
{"title": "City of Bones (The ..."},
{"title": "City of Ashes (The ..."},
{"title": "Cell"},
{"title": "Catching Jordan (Hundred Oaks)"},
{"title": "Carry On, Warrior: Thoughts ..."},
{"title": "Carrie"},
{"title": "Buying In: The Secret ..."},
{"title": "Brain on Fire: My ..."},
{"title": "Batman: Europa"},
{"title": "Barefoot Contessa Back to ..."},
{"title": "Barefoot Contessa at Home: ..."},
{"title": "Balloon Animals"},
{"title": "Art Ops Vol. 1"},
{"title": "Aristotle and Dante Discover ..."},
{"title": "Angels Walking (Angels Walking ..."},
{"title": "Angels & Demons (Robert ..."},
{"title": "All the Light We ..."},
{"title": "Adulthood Is a Myth: ..."},
{"title": "Abstract City"},
{"title": "A Time of Torment ..."},
{"title": "A Study in Scarlet ..."},
{"title": "A Series of Catastrophes ..."},
{"title": "A People's History of ..."},
{"title": "A Man Called Ove"},
{"title": "A Distant Mirror: The ..."},
{"title": "A Brush of Wings ..."},
{"title": "1491: New Revelations of ..."},
{"title": "The Three Searches, Meaning, ..."},
{"title": "Searching for Meaning in ..."},
{"title": "Rook"},
{"title": "My Kitchen Year: 136 ..."},
{"title": "13 Hours: The Inside ..."},
{"title": "Will You Won't You ..."},
{"title": "Tipping Point for Planet ..."},
{"title": "The Star-Touched Queen"},
{"title": "The Silent Sister (Riley ..."},
{"title": "The Midnight Watch: A ..."},
{"title": "The Lonely City: Adventures ..."},
{"title": "The Gray Rhino: How ..."},
{"title": "The Golden Condom: And ..."},
{"title": "The Epidemic (The Program ..."},
{"title": "The Dinner Party"},
{"title": "The Diary of a ..."},
{"title": "The Children"},
{"title": "Stars Above (The Lunar ..."},
{"title": "Snatched: How A Drug ..."},
{"title": "Raspberry Pi Electronics Projects ..."},
{"title": "Quench Your Own Thirst: ..."},
{"title": "Psycho: Sanitarium (Psycho #1.5)"},
{"title": "Poisonous (Max Revere Novels ..."},
{"title": "One with You (Crossfire ..."},
{"title": "No Love Allowed (Dodge ..."},
{"title": "Murder at the 42nd ..."},
{"title": "Most Wanted"},
{"title": "Love, Lies and Spies"},
{"title": "How to Speak Golf: ..."},
{"title": "Hide Away (Eve Duncan ..."},
{"title": "Furiously Happy: A Funny ..."},
{"title": "Everyday Italian: 125 Simple ..."},
{"title": "Equal Is Unfair: America's ..."},
{"title": "Eleanor & Park"},
{"title": "Dirty (Dive Bar #1)"},
{"title": "Can You Keep a ..."},
{"title": "Boar Island (Anna Pigeon ..."},
{"title": "A Paris Apartment"},
{"title": "A la Mode: 120 ..."},
{"title": "Troublemaker: Surviving Hollywood and ..."},
{"title": "The Widow"},
{"title": "The Sleep Revolution: Transforming ..."},
{"title": "The Improbability of Love"},
{"title": "The Art of Startup ..."},
{"title": "Take Me Home Tonight ..."},
{"title": "Sleeping Giants (Themis Files ..."},
{"title": "Setting the World on ..."},
{"title": "Playing with Fire"},
{"title": "Off the Hook (Fishing ..."},
{"title": "Mothering Sunday"},
{"title": "Mother, Can You Not?"},
{"title": "M Train"},
{"title": "Lilac Girls"},
{"title": "Lies and Other Acts ..."},
{"title": "Lab Girl"},
{"title": "Keep Me Posted"},
{"title": "It Didn't Start with ..."},
{"title": "Grey (Fifty Shades #4)"},
{"title": "Exit, Pursued by a ..."},
{"title": "Daredevils"},
{"title": "Cravings: Recipes for What ..."},
{"title": "Born for This: How ..."},
{"title": "Arena"},
{"title": "Adultery"},
{"title": "A Mother's Reckoning: Living ..."},
{"title": "A Gentleman's Position (Society ..."},
{"title": "11/22/63"},
{"title": "10% Happier: How I ..."},
{"title": "10-Day Green Smoothie Cleanse: ..."},
{"title": "Without Shame"},
{"title": "Watchmen"},
{"title": "Unlimited Intuition Now"},
{"title": "Underlying Notes"},
{"title": "The Shack"},
{"title": "The New Brand You: ..."},
{"title": "The Moosewood Cookbook: Recipes ..."},
{"title": "The Flowers Lied"},
{"title": "The Fabric of the ..."},
{"title": "The Book of Mormon"},
{"title": "The Art and Science ..."},
{"title": "The Alien Club"},
{"title": "Suzie Snowflake: One beautiful ..."},
{"title": "Nap-a-Roo"},
{"title": "NaNo What Now? Finding ..."},
{"title": "Modern Day Fables"},
{"title": "If I Gave You ..."},
{"title": "Fruits Basket, Vol. 9 ..."},
{"title": "Dress Your Family in ..."},
{"title": "Don't Forget Steven"},
{"title": "Chernobyl 01:23:40: The Incredible ..."},
{"title": "Art and Fear: Observations ..."},
{"title": "A Shard of Ice ..."},
{"title": "A Hero's Curse (The ..."},
{"title": "23 Degrees South: A ..."},
{"title": "Zero to One: Notes ..."},
{"title": "Why Not Me?"},
{"title": "When Breath Becomes Air"},
{"title": "Vagabonding: An Uncommon Guide ..."},
{"title": "The Unlikely Pilgrimage of ..."},
{"title": "The New Drawing on ..."},
{"title": "The Midnight Assassin: Panic, ..."},
{"title": "The Martian (The Martian ..."},
{"title": "The High Mountains of ..."},
{"title": "The Grownup"},
{"title": "The E-Myth Revisited: Why ..."},
{"title": "South of Sunshine"},
{"title": "Smarter Faster Better: The ..."},
{"title": "Silence in the Dark ..."},
{"title": "Shadows of the Past ..."},
{"title": "Roller Girl"},
{"title": "Rising Strong"},
{"title": "Proofs of God: Classical ..."},
{"title": "Please Kill Me: The ..."},
{"title": "Out of Print: City ..."},
{"title": "My Life Next Door ..."},
{"title": "Miller's Valley"},
{"title": "Man's Search for Meaning"},
{"title": "Love That Boy: What ..."},
{"title": "Living Forward: A Proven ..."},
{"title": "Les Fleurs du Mal"},
{"title": "Left Behind (Left Behind ..."},
{"title": "Kill 'Em and Leave: ..."},
{"title": "Kierkegaard: A Christian Missionary ..."},
{"title": "John Vassos: Industrial Design ..."},
{"title": "I'll Give You the ..."},
{"title": "I Will Find You"},
{"title": "Hystopia: A Novel"},
{"title": "Howl and Other Poems"},
{"title": "History of Beauty"},
{"title": "Heaven is for Real: ..."},
{"title": "Future Shock (Future Shock ..."},
{"title": "Ender's Game (The Ender ..."},
{"title": "Diary of a Citizen ..."},
{"title": "Death by Leisure: A ..."},
{"title": "Brilliant Beacons: A History ..."},
{"title": "Brazen: The Courage to ..."},
{"title": "Between the World and ..."},
{"title": "Being Mortal: Medicine and ..."},
{"title": "A Murder Over a ..."},
{"title": "32 Yolks"},
{"title": "\"Most Blessed of the ..."},
{"title": "You Are a Badass: ..."},
{"title": "Wildlife of New York: ..."},
{"title": "What Happened on Beale ..."},
{"title": "Unreasonable Hope: Finding Faith ..."},
{"title": "Under the Tuscan Sun"},
{"title": "Toddlers Are A**holes: It's ..."},
{"title": "The Year of Living ..."},
{"title": "The Whale"},
{"title": "The Story of Art"},
{"title": "The Origin of Species"},
{"title": "The Great Gatsby"},
{"title": "The Good Girl"},
{"title": "The Glass Castle"},
{"title": "The Faith of Christopher ..."},
{"title": "The Drowning Girls"},
{"title": "The Constant Princess (The ..."},
{"title": "The Bourne Identity (Jason ..."},
{"title": "The Bachelor Girl's Guide ..."},
{"title": "The Art Book"},
{"title": "The 7 Habits of ..."},
{"title": "Team of Rivals: The ..."},
{"title": "Steal Like an Artist: ..."},
{"title": "Sit, Stay, Love"},
{"title": "Sister Dear"},
{"title": "Shrunken Treasures: Literary Classics, ..."},
{"title": "Rich Dad, Poor Dad"},
{"title": "Raymie Nightingale"},
{"title": "Playing from the Heart"},
{"title": "Nightstruck: A Novel"},
{"title": "Naturally Lean: 125 Nourishing ..."},
{"title": "Meternity"},
{"title": "Memoirs of a Geisha"},
{"title": "Like Never Before (Walker ..."},
{"title": "Life of Pi"},
{"title": "Leave This Song Behind: ..."},
{"title": "King's Folly (The Kinsman ..."},
{"title": "John Adams"},
{"title": "How to Cook Everything ..."},
{"title": "How to Be a ..."},
{"title": "Good in Bed (Cannie ..."},
{"title": "Fruits Basket, Vol. 7 ..."},
{"title": "For the Love: Fighting ..."},
{"title": "Finding God in the ..."},
{"title": "Every Heart a Doorway ..."},
{"title": "Delivering the Truth (Quaker ..."},
{"title": "Counted With the Stars ..."},
{"title": "Chronicles, Vol. 1"},
{"title": "Blue Like Jazz: Nonreligious ..."},
{"title": "Benjamin Franklin: An American ..."},
{"title": "At The Existentialist Caf\u00e9: ..."},
{"title": "A Summer In Europe"},
{"title": "A Short History of ..."},
{"title": "A Gathering of Shadows ..."},
{"title": "The Sound Of Love"},
{"title": "The Rise and Fall ..."},
{"title": "The Perks of Being ..."},
{"title": "The Mysterious Affair at ..."},
{"title": "The Man Who Mistook ..."},
{"title": "The Makings of a ..."},
{"title": "The Joy of Cooking"},
{"title": "The Invention of Wings"},
{"title": "The Hobbit (Middle-Earth Universe)"},
{"title": "The Great Railway Bazaar"},
{"title": "The Golden Compass (His ..."},
{"title": "The God Delusion"},
{"title": "The Girl You Left ..."},
{"title": "The Fellowship of the ..."},
{"title": "The Collected Poems of ..."},
{"title": "The Barefoot Contessa Cookbook"},
{"title": "Tell the Wolves I'm ..."},
{"title": "Ship Leaves Harbor: Essays ..."},
{"title": "Pride and Prejudice"},
{"title": "Musicophilia: Tales of Music ..."},
{"title": "Mere Christianity"},
{"title": "Me Before You (Me ..."},
{"title": "In the Woods (Dublin ..."},
{"title": "In Cold Blood"},
{"title": "How to Stop Worrying ..."},
{"title": "Give It Back"},
{"title": "Girl, Interrupted"},
{"title": "Fun Home: A Family ..."},
{"title": "Fruits Basket, Vol. 6 ..."},
{"title": "Deception Point"},
{"title": "Death Note, Vol. 6: ..."},
{"title": "Catherine the Great: Portrait ..."},
{"title": "Better Homes and Gardens ..."},
{"title": "An Unquiet Mind: A ..."},
{"title": "A Year in Provence ..."},
{"title": "World Without End (The ..."},
{"title": "Will Grayson, Will Grayson ..."},
{"title": "Why Save the Bankers?: ..."},
{"title": "Where She Went (If ..."},
{"title": "What If?: Serious Scientific ..."},
{"title": "Two Summers"},
{"title": "This Is Your Brain ..."},
{"title": "The Secret Garden"},
{"title": "The Raven King (The ..."},
{"title": "The Raven Boys (The ..."},
{"title": "The Power Greens Cookbook: ..."},
{"title": "The Metamorphosis"},
{"title": "The Mathews Men: Seven ..."},
{"title": "The Little Paris Bookshop"},
{"title": "The Hiding Place"},
{"title": "The Grand Design"},
{"title": "The Firm"},
{"title": "The Fault in Our ..."},
{"title": "The False Prince (The ..."},
{"title": "The Expatriates"},
{"title": "The Dream Thieves (The ..."},
{"title": "The Darkest Corners"},
{"title": "The Crossover"},
{"title": "The 5th Wave (The ..."},
{"title": "Tell the Wind and ..."},
{"title": "Tell Me Three Things"},
{"title": "Talking to Girls About ..."},
{"title": "Siddhartha"},
{"title": "Shiver (The Wolves of ..."},
{"title": "Remember Me?"},
{"title": "Red Dragon (Hannibal Lecter ..."},
{"title": "Peak: Secrets from the ..."},
{"title": "My Mother Was Nuts"},
{"title": "Mexican Today: New and ..."},
{"title": "Maybe Something Beautiful: How ..."},
{"title": "Lola and the Boy ..."},
{"title": "Logan Kade (Fallen Crest ..."},
{"title": "Last One Home (New ..."},
{"title": "Killing Floor (Jack Reacher ..."},
{"title": "Kill the Boy Band"},
{"title": "Isla and the Happily ..."},
{"title": "If I Stay (If ..."},
{"title": "I Know Why the ..."},
{"title": "Harry Potter and the ..."},
{"title": "Fruits Basket, Vol. 5 ..."},
{"title": "Foundation (Foundation (Publication Order) ..."},
{"title": "Fool Me Once"},
{"title": "Find Her (Detective D.D. ..."},
{"title": "Evicted: Poverty and Profit ..."},
{"title": "Drama"},
{"title": "Dracula the Un-Dead"},
{"title": "Digital Fortress"},
{"title": "Death Note, Vol. 5: ..."},
{"title": "Data, A Love Story: ..."},
{"title": "Critique of Pure Reason"},
{"title": "Booked"},
{"title": "Blue Lily, Lily Blue ..."},
{"title": "Approval Junkie: Adventures in ..."},
{"title": "An Abundance of Katherines"},
{"title": "America's War for the ..."},
{"title": "Alight (The Generations Trilogy ..."},
{"title": "A Girl's Guide to ..."},
{"title": "A Game of Thrones ..."},
{"title": "A Feast for Crows ..."},
{"title": "A Clash of Kings ..."},
{"title": "Vogue Colors A to ..."},
{"title": "The Shining (The Shining ..."},
{"title": "The Pilgrim's Progress"},
{"title": "The Perfect Play (Play ..."},
{"title": "The Passion of Dolssa"},
{"title": "The Jazz of Physics: ..."},
{"title": "The Hunger Games (The ..."},
{"title": "The Hound of the ..."},
{"title": "The Gunning of America: ..."},
{"title": "The Geography of Bliss: ..."},
{"title": "The Demonists (Demonist #1)"},
{"title": "The Demon Prince of ..."},
{"title": "The Bone Hunters (Lexy ..."},
{"title": "The Beast (Black Dagger ..."},
{"title": "Some Women"},
{"title": "Shopaholic Ties the Knot ..."},
{"title": "Paper and Fire (The ..."},
{"title": "Outlander (Outlander #1)"},
{"title": "Orchestra of Exiles: The ..."},
{"title": "No One Here Gets ..."},
{"title": "Night Shift (Night Shift ..."},
{"title": "Needful Things"},
{"title": "Mockingjay (The Hunger Games ..."},
{"title": "Misery"},
{"title": "Little Women (Little Women ..."},
{"title": "It"},
{"title": "Harry Potter and the ..."},
{"title": "Harry Potter and the ..."},
{"title": "Harry Potter and the ..."},
{"title": "Harry Potter and the ..."},
{"title": "Harry Potter and the ..."},
{"title": "Gone with the Wind"},
{"title": "God Is Not Great: ..."},
{"title": "Girl With a Pearl ..."},
{"title": "Fruits Basket, Vol. 4 ..."},
{"title": "Far From True (Promise ..."},
{"title": "Dark Lover (Black Dagger ..."},
{"title": "Confessions of a Shopaholic ..."},
{"title": "Changing the Game (Play ..."},
{"title": "Candide"},
{"title": "Can You Keep a ..."},
{"title": "Atlas Shrugged"},
{"title": "Animal Farm"},
{"title": "A Walk to Remember"},
{"title": "A New Earth: Awakening ..."},
{"title": "A History of God: ..."},
{"title": "'Salem's Lot"},
{"title": "Zero History (Blue Ant ..."},
{"title": "Wuthering Heights"},
{"title": "World War Z: An ..."},
{"title": "Wild: From Lost to ..."},
{"title": "Where'd You Go, Bernadette"},
{"title": "When You Are Engulfed ..."},
{"title": "We the People: The ..."},
{"title": "We Are All Completely ..."},
{"title": "Walk the Edge (Thunder ..."},
{"title": "Voyager (Outlander #3)"},
{"title": "Very Good Lives: The ..."},
{"title": "Vegan Vegetarian Omnivore: Dinner ..."},
{"title": "Unstuffed: Decluttering Your Home, ..."},
{"title": "Under the Banner of ..."},
{"title": "Two Boys Kissing"},
{"title": "Twilight (Twilight #1)"},
{"title": "Twenties Girl"},
{"title": "Trespassing Across America: One ..."},
{"title": "Three-Martini Lunch"},
{"title": "Thinking, Fast and Slow"},
{"title": "The Wild Robot"},
{"title": "The Wicked + The ..."},
{"title": "The Undomestic Goddess"},
{"title": "The Travelers"},
{"title": "The Tipping Point: How ..."},
{"title": "The Thing About Jellyfish"},
{"title": "The Stand"},
{"title": "The Smitten Kitchen Cookbook"},
{"title": "The Silkworm (Cormoran Strike ..."},
{"title": "The Sandman, Vol. 3: ..."},
{"title": "The Rose & the ..."},
{"title": "The Road to Little ..."},
{"title": "The Rise of Theodore ..."},
{"title": "The Restaurant at the ..."},
{"title": "The Rest Is Noise: ..."},
{"title": "The Red Tent"},
{"title": "The Purpose Driven Life: ..."},
{"title": "The Purest Hook (Second ..."},
{"title": "The Picture of Dorian ..."},
{"title": "The Paris Wife"},
{"title": "The Obsession"},
{"title": "The Nightingale"},
{"title": "The New Guy (and ..."},
{"title": "The Nanny Diaries (Nanny ..."},
{"title": "The Name of God ..."},
{"title": "The Maze Runner (The ..."},
{"title": "The Lover's Dictionary"},
{"title": "The Lonely Ones"},
{"title": "The Lean Startup: How ..."},
{"title": "The Last Painting of ..."},
{"title": "The Land of 10,000 ..."},
{"title": "The Infinities"},
{"title": "The Husband's Secret"},
{"title": "The Hitchhiker's Guide to ..."},
{"title": "The Guns of August"},
{"title": "The Guernsey Literary and ..."},
{"title": "The Goldfinch"},
{"title": "The Giver (The Giver ..."},
{"title": "The Girl with All ..."},
{"title": "The Girl Who Played ..."},
{"title": "The Girl Who Kicked ..."},
{"title": "The Exiled"},
{"title": "The End of Faith: ..."},
{"title": "The Elegant Universe: Superstrings, ..."},
{"title": "The Disappearing Spoon: And ..."},
{"title": "The Devil Wears Prada ..."},
{"title": "The Demon-Haunted World: Science ..."},
{"title": "The Day the Crayons ..."},
{"title": "The Da Vinci Code ..."},
{"title": "The Cuckoo's Calling (Cormoran ..."},
{"title": "The Complete Stories and ..."},
{"title": "The Complete Poems"},
{"title": "The Catcher in the ..."},
{"title": "The Cat in the ..."},
{"title": "The Case for Christ ..."},
{"title": "The Book Thief"},
{"title": "The Book of Basketball: ..."},
{"title": "The Blind Side: Evolution ..."},
{"title": "The Autobiography of Malcolm ..."},
{"title": "The Art of Simple ..."},
{"title": "The Art of Fielding"},
{"title": "Surely You're Joking, Mr. ..."},
{"title": "Stiff: The Curious Lives ..."},
{"title": "Spilled Milk: Based on ..."},
{"title": "Something Borrowed (Darcy & ..."},
{"title": "Something Blue (Darcy & ..."},
{"title": "Soldier (Talon #3)"},
{"title": "Shopaholic & Baby (Shopaholic ..."},
{"title": "Seven Days in the ..."},
{"title": "Seven Brief Lessons on ..."},
{"title": "Scarlet (The Lunar Chronicles ..."},
{"title": "Sarah's Key"},
{"title": "Saga, Volume 3 (Saga ..."},
{"title": "Running with Scissors"},
{"title": "Rogue Lawyer (Rogue Lawyer ..."},
{"title": "Rise of the Rocket ..."},
{"title": "Rework"},
{"title": "Reservations for Two"},
{"title": "Red: The True Story ..."},
{"title": "Ready Player One"},
{"title": "Quiet: The Power of ..."},
{"title": "Prodigy: The Graphic Novel ..."},
{"title": "Persepolis: The Story of ..."},
{"title": "Packing for Mars: The ..."},
{"title": "Outliers: The Story of ..."},
{"title": "Original Fake"},
{"title": "Orange Is the New ..."},
{"title": "One for the Money ..."},
{"title": "Notes from a Small ..."},
{"title": "Night (The Night Trilogy ..."},
{"title": "Neither Here nor There: ..."},
{"title": "Naked"},
{"title": "Morning Star (Red Rising ..."},
{"title": "Miracles from Heaven: A ..."},
{"title": "Midnight Riot (Peter Grant/ ..."},
{"title": "Me Talk Pretty One ..."},
{"title": "Manuscript Found in Accra"},
{"title": "Lust & Wonder"},
{"title": "Lila (Gilead #3)"},
{"title": "Life, the Universe and ..."},
{"title": "Life Without a Recipe"},
{"title": "Life After Life"},
{"title": "Letter to a Christian ..."},
{"title": "Let's Pretend This Never ..."},
{"title": "Legend (Legend #1)"},
{"title": "Lean In: Women, Work, ..."},
{"title": "Lamb: The Gospel According ..."},
{"title": "Lady Renegades (Rebel Belle ..."},
{"title": "Jurassic Park (Jurassic Park ..."},
{"title": "It's Never Too Late ..."},
{"title": "Is Everyone Hanging Out ..."},
{"title": "Into the Wild"},
{"title": "Inferno (Robert Langdon #4)"},
{"title": "In the Garden of ..."},
{"title": "If I Run (If ..."},
{"title": "I've Got Your Number"},
{"title": "I Am Malala: The ..."},
{"title": "Hungry Girl Clean & ..."},
{"title": "House of Lost Worlds: ..."},
{"title": "House of Leaves"},
{"title": "Horrible Bear!"},
{"title": "Holidays on Ice"},
{"title": "Heir to the Sky"},
{"title": "Green Eggs and Ham ..."},
{"title": "Grayson, Vol 3: Nemesis ..."},
{"title": "Gratitude"},
{"title": "Gone Girl"},
{"title": "Golden (Heart of Dread ..."},
{"title": "Girl in the Blue ..."},
{"title": "Fruits Basket, Vol. 3 ..."},
{"title": "Friday Night Lights: A ..."},
{"title": "Fire Bound (Sea Haven/Sisters ..."},
{"title": "Fifty Shades Freed (Fifty ..."},
{"title": "Fellside"},
{"title": "Extreme Prey (Lucas Davenport ..."},
{"title": "Eragon (The Inheritance Cycle ..."},
{"title": "Eclipse (Twilight #3)"},
{"title": "Dune (Dune #1)"},
{"title": "Dracula"},
{"title": "Do Androids Dream of ..."},
{"title": "Disrupted: My Misadventure in ..."},
{"title": "Dead Wake: The Last ..."},
{"title": "David and Goliath: Underdogs, ..."},
{"title": "Darkfever (Fever #1)"},
{"title": "Dark Places"},
{"title": "Crazy Rich Asians (Crazy ..."},
{"title": "Counting Thyme"},
{"title": "Cosmos"},
{"title": "Civilization and Its Discontents"},
{"title": "Cinder (The Lunar Chronicles ..."},
{"title": "Catastrophic Happiness: Finding Joy ..."},
{"title": "Career of Evil (Cormoran ..."},
{"title": "Breaking Dawn (Twilight #4)"},
{"title": "Brave Enough"},
{"title": "Boy Meets Boy"},
{"title": "Born to Run: A ..."},
{"title": "Blink: The Power of ..."},
{"title": "Black Flags: The Rise ..."},
{"title": "Black Butler, Vol. 1 ..."},
{"title": "Big Little Lies"},
{"title": "Between Shades of Gray"},
{"title": "Best of My Love ..."},
{"title": "Beowulf"},
{"title": "Beautiful Creatures (Caster Chronicles ..."},
{"title": "Awkward"},
{"title": "Ash"},
{"title": "Are We There Yet?"},
{"title": "Are We Smart Enough ..."},
{"title": "Annie on My Mind"},
{"title": "And Then There Were ..."},
{"title": "A Walk in the ..."},
{"title": "A Visit from the ..."},
{"title": "A Storm of Swords ..."},
{"title": "A Heartbreaking Work of ..."},
{"title": "8 Keys to Mental ..."},
{"title": "#GIRLBOSS"},
{"title": "The Suffragettes (Little Black ..."},
{"title": "The Sense of an ..."},
{"title": "The Sandman, Vol. 2: ..."},
{"title": "The Course of Love"},
{"title": "Sugar Rush (Offensive Line ..."},
{"title": "Saga, Volume 2 (Saga ..."},
{"title": "Run, Spot, Run: The ..."},
{"title": "New Moon (Twilight #2)"},
{"title": "Life"},
{"title": "Kindle Paperwhite User's Guide"},
{"title": "H is for Hawk"},
{"title": "Girl Online On Tour ..."},
{"title": "Fruits Basket, Vol. 2 ..."},
{"title": "Diary of a Minecraft ..."},
{"title": "Y: The Last Man, ..."},
{"title": "While You Were Mine"},
{"title": "Where Lightning Strikes (Bleeding ..."},
{"title": "When I'm Gone"},
{"title": "Ways of Seeing"},
{"title": "Vampire Knight, Vol. 1 ..."},
{"title": "Vampire Girl (Vampire Girl ..."},
{"title": "Twenty Love Poems and ..."},
{"title": "Travels with Charley: In ..."},
{"title": "Three Wishes (River of ..."},
{"title": "This One Moment (Pushing ..."},
{"title": "The Zombie Room"},
{"title": "The Wicked + The ..."},
{"title": "The Tumor"},
{"title": "The Story of Hong ..."},
{"title": "The Silent Wife"},
{"title": "The Silent Twin (Detective ..."},
{"title": "The Selfish Gene"},
{"title": "The Secret Healer"},
{"title": "The Sandman, Vol. 1: ..."},
{"title": "The Republic"},
{"title": "The Odyssey"},
{"title": "The No. 1 Ladies' ..."},
{"title": "The Nicomachean Ethics"},
{"title": "The Name of the ..."},
{"title": "The Mirror & the ..."},
{"title": "The Little Prince"},
{"title": "The Light of the ..."},
{"title": "The Last Girl (The ..."},
{"title": "The Iliad"},
{"title": "The Hook Up (Game ..."},
{"title": "The Haters"},
{"title": "The Girl You Lost"},
{"title": "The Girl In The ..."},
{"title": "The End of the ..."},
{"title": "The Edge of Reason ..."},
{"title": "The Complete Maus (Maus ..."},
{"title": "The Communist Manifesto"},
{"title": "The Bhagavad Gita"},
{"title": "The Bette Davis Club"},
{"title": "The Art of Not ..."},
{"title": "Taking Shots (Assassins #1)"},
{"title": "Starlark"},
{"title": "Skip Beat!, Vol. 01 ..."},
{"title": "Sister Sable (The Mad ..."},
{"title": "Shatter Me (Shatter Me ..."},
{"title": "Shameless"},
{"title": "Shadow Rites (Jane Yellowrock ..."},
{"title": "Settling the Score (The ..."},
{"title": "Sense and Sensibility"},
{"title": "Saga, Volume 1 (Saga ..."},
{"title": "Rhythm, Chord & Malykhin"},
{"title": "Rat Queens, Vol. 1: ..."},
{"title": "Paradise Lost (Paradise #1)"},
{"title": "Paper Girls, Vol. 1 ..."},
{"title": "Ouran High School Host ..."},
{"title": "Origins (Alphas 0.5)"},
{"title": "One Second (Seven #7)"},
{"title": "On the Road (Duluoz ..."},
{"title": "Old Records Never Die: ..."},
{"title": "Off Sides (Off #1)"},
{"title": "Of Mice and Men"},
{"title": "Myriad (Prentor #1)"},
{"title": "My Perfect Mistake (Over ..."},
{"title": "Ms. Marvel, Vol. 1: ..."},
{"title": "Meditations"},
{"title": "Matilda"},
{"title": "Lost Among the Living"},
{"title": "Lord of the Flies"},
{"title": "Listen to Me (Fusion ..."},
{"title": "Kitchens of the Great ..."},
{"title": "Jane Eyre"},
{"title": "Imperfect Harmony"},
{"title": "Icing (Aces Hockey #2)"},
{"title": "Hawkeye, Vol. 1: My ..."},
{"title": "Having the Barbarian's Baby ..."},
{"title": "Giant Days, Vol. 1 ..."},
{"title": "Fruits Basket, Vol. 1 ..."},
{"title": "Frankenstein"},
{"title": "Forever Rockers (The Rocker ..."},
{"title": "Fighting Fate (Fighting #6)"},
{"title": "Emma"},
{"title": "Eat, Pray, Love"},
{"title": "Deep Under (Walker Security ..."},
{"title": "Choosing Our Religion: The ..."},
{"title": "Charlie and the Chocolate ..."},
{"title": "Charity's Cross (Charles Towne ..."},
{"title": "Bright Lines"},
{"title": "Bridget Jones's Diary (Bridget ..."},
{"title": "Bounty (Colorado Mountain #7)"},
{"title": "Blood Defense (Samantha Brinkman ..."},
{"title": "Bleach, Vol. 1: Strawberry ..."},
{"title": "Beyond Good and Evil"},
{"title": "Alice in Wonderland (Alice's ..."},
{"title": "Ajin: Demi-Human, Volume 1 ..."},
{"title": "A Spy's Devotion (The ..."},
{"title": "1st to Die (Women's ..."},
{"title": "1,000 Places to See ..."}
]

75
crawler/mk.py Executable file
View File

@ -0,0 +1,75 @@
#!/bin/python3
import sys
import json
import numpy as np
# Load the crawl result.  The rest of the script treats `adjlist` as a dict
# whose values are lists (presumably page URL -> list of linked URLs; confirm
# against the crawler that writes videospiele.json).
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale encoding.
with open("videospiele.json", encoding="utf-8") as f:
    lines = [line.strip() for line in f]
# Echo the first line as a quick sanity check.  An empty file has no first
# line; skip the echo so the JSON parser below reports the real problem.
if lines:
    print(lines[0])
lines = " ".join(lines)
adjlist = json.loads(lines)
def colorize(word):
    """Return *word* wrapped in ANSI escape sequences for terminal highlighting.

    The text is prefixed with the SGR sequence ``0;33`` (yellow foreground on
    standard terminals) and followed by the SGR reset sequence ``0``.
    """
    highlighted = f"\u001b[0;33m{word}\u001b[0m"
    return highlighted
def softmax(distr, temperature=1.5):
    """Return the temperature-scaled softmax of *distr* as a new float array.

    Every score ``x`` is mapped to ``exp(x / temperature)`` and the results
    are divided by their sum, so the returned values are positive and sum
    to 1.

    Args:
        distr: One-dimensional sequence or array of real-valued scores.
            It is never modified; lists and integer arrays are accepted.
        temperature: Positive divisor applied to the scores before
            exponentiation.  Defaults to 1.5.

    Returns:
        A new ``numpy.ndarray`` of floats with one entry per input score.
        An empty input yields an empty array.

    Raises:
        ValueError: If *temperature* is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    scores = np.asarray(distr, dtype=float)
    if scores.size == 0:
        # Copy so the caller never receives an alias of its own array.
        return scores.copy()
    # Subtracting the maximum does not change the result mathematically but
    # keeps exp() from overflowing on large scores.
    weights = np.exp((scores - scores.max()) / temperature)
    return weights / weights.sum()
# --- Index every page -------------------------------------------------------
# A page is either a key of `adjlist` or a member of one of its value lists.
# Collecting them with set.update avoids the quadratic list concatenation of
# sum(lists, []).
indices = set(adjlist)
for targets in adjlist.values():
    indices.update(targets)
if not indices:
    # Without pages there is nothing to rank, and 1/N below would divide by 0.
    sys.exit("The link graph is empty; nothing to rank")
# Sorting pins every page to the same id on every run; iterating the bare set
# would depend on the per-process string hash seed.
# NOTE(review): assumes all entries are strings (mutually comparable) — confirm.
ordered_links = sorted(indices)
id_to_link = dict(enumerate(ordered_links))
link_to_id = {link: idx for idx, link in enumerate(ordered_links)}
N = len(indices)

# `array` is not used below; the import is retained in case it is relied on
# elsewhere.  The former `matrix as M` alias was dropped because `M` is
# rebound to the transition matrix right away.
from numpy import array
print("allocating array")
# m[i, j] == 1 iff page i links to page j; M becomes the row-stochastic
# transition matrix derived from m.
m = np.zeros((N, N), dtype=int)
M = np.zeros((N, N), dtype=float)

print("processing bigrams")
for source, targets in adjlist.items():
    source_id = link_to_id[source]
    for target in targets:
        # source -> target
        m[source_id, link_to_id[target]] = 1

print("normalizing matrix")
# d is the damping factor: with weight d a step follows an outgoing link,
# with weight (1 - d) it jumps to a uniformly chosen page.
d = 0.85
uniform = np.full(N, 1/N)
out_degree = m.sum(axis=1)
for i, degree in enumerate(out_degree):
    if degree == 0:
        # Page without outgoing links: jump anywhere with equal probability.
        M[i] = uniform
    else:
        M[i] = (1-d)*uniform + d * m[i] * (1.0/degree)
print("Done preparing")

from numpy import linalg as LA
print("Computing eigenvalues")
# Eigenvectors of M transposed are left eigenvectors of M; the one belonging
# to eigenvalue 1 satisfies ev @ M == ev.
ews, evs = LA.eig(M.transpose())
ews_1 = [ (i, v) for (i, v) in enumerate(ews) if abs(v - 1) < 0.00001 ]
print(f"Found {len(ews_1)} eigenvectors to eigenvalue 1")
if not ews_1:
    sys.exit("No eigenvalue close to 1 found; cannot derive a stationary distribution")
ew_idx = ews_1[0][0]
ev = evs[:,ew_idx]
print("Normalizing eigenvalue to a PDF")
# Scale the eigenvector so its entries sum to 1, then drop the (numerically
# zero) imaginary parts returned by the general eigen-solver.
ev = np.real(ev / ev.sum())
print("Stored eigenvector for eigenvalue 1 in `ev`")
print("Verify that `ev @ M == ev`")
print("All pages with P > 1/N")
# Order by the numeric probability (ascending) before formatting.  Sorting
# the formatted strings would be lexicographic, e.g. "10.00%" before "2.00%".
above_uniform = sorted((v, id_to_link[i]) for (i, v) in enumerate(ev) if v > 1/N)
hits = [ (f"{v*100:.2f}%", link) for (v, link) in above_uniform ]

0
crawler/out.json Normal file
View File

1
crawler/videospiele.json Normal file

File diff suppressed because one or more lines are too long

12326
hp/hodbook.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,10 @@
#!/bin/python3
import sys
# The input file path must be given on the command line.
if len(sys.argv) < 2:
    print(sys.argv)
    # sys.exit is always available; the bare exit() builtin is injected by
    # the `site` module and is missing when Python runs without it.
    sys.exit("Please provide a filename as argument")
# The last command-line argument is taken as the input file path.
filename = sys.argv[-1]
def colorize(word):
    """Return *word* wrapped in ANSI escape sequences for terminal highlighting.

    The text is prefixed with the SGR sequence ``0;33`` (yellow foreground on
    standard terminals) and followed by the SGR reset sequence ``0``.
    """
    highlighted = f"\u001b[0;33m{word}\u001b[0m"
    return highlighted
@ -16,7 +22,7 @@ import re
# Matches lines that begin with the literal word "CHAPTER".
chapter_pattern = re.compile(r"^CHAPTER")
# Matches lines made up solely of digits (presumably page numbers — confirm
# against the input text).  Raw string: "\d" in a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
page_pattern = re.compile(r"^\d+$")
with open("hp.txt") as f:
with open(filename) as f:
lines = f.readlines()
print("Stripping excess data")