forschungstage-2023/crawler/markov.py
2023-06-16 12:32:22 +02:00

76 lines
1.8 KiB
Python
Executable File

#!/bin/python3
import sys
import json
import numpy as np
with open("videospiele.json") as f:
lines = [ line.strip() for line in f.readlines()]
print(lines[0])
lines = " ".join(lines)
adjlist = json.loads(lines)
def colorize(word):
return f"\u001b[0;33m{word}\u001b[0m"
def softmax(distr):
temperature = 1.5
def f(x):
return np.exp(x / temperature)
for i in range(len(distr)):
distr[i] = f(distr[i])
Σ = sum(distr)
distr /= Σ
return distr
indices = set([ *adjlist.keys(), *sum(adjlist.values(), [])])
id_to_link = dict(enumerate(indices))
link_to_id = dict([ [word, idx] for [idx, word] in enumerate(indices)])
N = len(indices)
from numpy import matrix as M, array
print("allocating array")
m = array([0])
m.resize(N, N)
M = array([0.0])
M.resize(N, N)
print("processing bigrams")
for i in indices:
if i not in adjlist: continue
for j in adjlist[i]:
# i -> j
iid, jid = link_to_id[i], link_to_id[j]
#print(f"{i} {iid} -> {j} {jid}")
m[iid, jid] = 1
print("normalizing matrix")
d = 0.85
uniform = np.full(N, 1/N)
for i in range(N):
row = m[i]
Σ = sum(row)
if Σ == 0:
M[i] = uniform
else:
M[i] = (1-d)*uniform + d * m[i] * (1.0/Σ)
print("Done preparing")
from numpy import linalg as LA
print("Computing eigenvalues")
ews, evs = LA.eig(M.transpose())
ews_1 = [ (i, v) for (i, v) in enumerate(ews) if abs(v - 1) < 0.00001 ]
print(f"Found {len(ews_1)} eigenvectors to eigenvalue 1")
ew_idx = ews_1[0][0]
ev = evs[:,ew_idx]
print("Normalizing eigenvalue to a PDF")
ev = np.real(ev / sum(ev))
print("Stored eigenvector for eigenvalue 1 in `ev`")
print("Verify that `ev @ M == ev`")
print("All pages with P > 1/N")
hits = sorted([ (f"{v*100:.2f}%", id_to_link[i] ) for (i, v) in enumerate(ev) if v > 1/N ])