Implement HP
This commit is contained in:
parent
5f5e96c0cf
commit
6ae93e74f5
123
hp/hp.py
Executable file
123
hp/hp.py
Executable file
@ -0,0 +1,123 @@
|
||||
#!/bin/python3
|
||||
|
||||
def colorize(word):
    """Wrap *word* in ANSI escape codes so a terminal renders it in yellow.

    The result is the SGR sequence ``ESC[0;33m``, then the formatted word,
    then the reset sequence ``ESC[0m``.
    """
    start_yellow = "\u001b[0;33m"
    reset = "\u001b[0m"
    return "{}{}{}".format(start_yellow, word, reset)
|
||||
|
||||
def softmax(distr, temperature=1.5):
    """Return the temperature-scaled softmax of *distr* as a new array.

    Each score ``x`` is weighted by ``exp(x / temperature)`` and the weights
    are normalised so they sum to 1.

    Args:
        distr: Sequence (list or array) of real-valued scores. It is not
            modified.
        temperature: Positive scaling factor; larger values flatten the
            resulting distribution. Defaults to 1.5.

    Returns:
        A float ``numpy.ndarray`` with the same length as *distr*. An empty
        input yields an empty array.

    Raises:
        ValueError: If *temperature* is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    # `np` is looked up in the module globals when the function is called.
    scores = np.asarray(distr, dtype=float) / temperature
    if scores.size == 0:
        return scores
    # Subtracting the maximum does not change the normalised result but keeps
    # exp() from overflowing to inf (which would turn the output into NaN).
    weights = np.exp(scores - scores.max())
    return weights / weights.sum()
|
||||
import re
|
||||
# Matches lines that begin with the literal text "CHAPTER".
chapter_pattern = re.compile(r"^CHAPTER")
# Matches lines made up of digits only. Raw string: "\d" in a normal string
# literal is an invalid escape sequence and triggers a SyntaxWarning on
# recent Python versions.
page_pattern = re.compile(r"^\d+$")
|
||||
|
||||
# Load the raw text; everything below works on this list of lines.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm that matches how hp.txt is stored.
with open("hp.txt") as f:
    lines = f.readlines()

print("Stripping excess data")
# Collect the lines that are kept for the model. The first line of the file
# is always skipped, a line matching chapter_pattern is dropped together with
# the line that follows it, and a line matching page_pattern is dropped.
truelines = []
pending = iter(lines[1:])
for raw_line in pending:
    line = raw_line.strip()
    if chapter_pattern.match(line):
        # Ditch one extra line after the chapter heading (no-op at end of file).
        next(pending, None)
        continue
    if page_pattern.match(line):
        continue
    truelines.append(line)
|
||||
|
||||
print("Parsing lines")
corpus = " ".join(truelines)
# split() without an argument collapses runs of whitespace. Blank entries in
# truelines put consecutive spaces into the corpus, and split(" ") would turn
# each of those into an empty-string token that ends up in the vocabulary.
tokens = corpus.split()
words = set(tokens)
# Number the distinct tokens in order of first appearance. Iterating the set
# directly gives an order that can change from one interpreter run to the
# next, which would make the ids (and the walk's start word, id 0) vary.
id_to_word = dict(enumerate(dict.fromkeys(tokens)))
word_to_id = {word: idx for idx, word in id_to_word.items()}

# Vocabulary size; also the side length of the bigram matrices.
N = len(words)
|
||||
from numpy import matrix as M, array
|
||||
#m = M( [ [ 0 for _ in range(len(words)) ] for _ in range(len(words)) ] )
|
||||
|
||||
from numpy import zeros

print("allocating array")
# m[i, j] is incremented by the bigram loop for every occurrence of word j
# directly after word i (integer counts).
m = zeros((N, N), dtype=int)
# M receives the row-normalised version of m (floats). Note that this
# rebinding replaces the `matrix` alias imported under the same name above.
M = zeros((N, N), dtype=float)
|
||||
|
||||
print("processing bigrams")
# Pair every token with its direct successor and count the pair in m.
bigrams = zip(tokens[:-1], tokens[1:])
for current_word, following_word in bigrams:
    m[word_to_id[current_word], word_to_id[following_word]] += 1
|
||||
|
||||
print("normalizing matrix")
# Turn each row of counts in m into a row of relative frequencies in M.
for i in range(N):
    # ndarray.sum() adds the row in native code; the builtin sum() would
    # iterate over all N entries in Python for every one of the N rows.
    total = m[i].sum()
    if total == 0:
        # Row without any counted successor: leave the M row as all zeros.
        continue
    M[i] = m[i] / total
|
||||
|
||||
print("Done preparing")
|
||||
|
||||
# randomwalker
|
||||
from time import sleep
|
||||
from numpy.random import random as uniform
|
||||
import numpy as np
|
||||
#word = id_to_word[0]
|
||||
|
||||
# Large magnitude used as a stand-in for infinity: simulate() and
# interactive() replace zero entries with -infty before calling softmax().
infty = 1_000_000
|
||||
def simulate(softmax_enabled = True):
    """Print an endless random walk over the bigram model until Ctrl-C.

    The walk starts at word id 0. At every step the next word is sampled from
    the current word's row of M and printed, followed by a space. A word that
    was the only possible choice (probability exactly 1) is printed plainly;
    every other word is passed through colorize().

    Args:
        softmax_enabled: When true, the row is reshaped with softmax() before
            sampling; zero entries are replaced by -infty first.

    The function returns after a KeyboardInterrupt, having printed a farewell
    message.
    """
    word_id = 0
    try:
        while True:
            # Row word_id of M is the successor distribution. Indexing it
            # directly gives the same values as multiplying a one-hot vector
            # with the whole matrix, without touching all N*N entries.
            transition = M[word_id]
            if not transition.any():
                # The current word has no recorded successor. Continue with
                # any word, uniformly at random, instead of failing with
                # StopIteration on an all-zero distribution.
                next_distr = np.full(N, 1.0 / N)
            elif softmax_enabled:
                next_distr = softmax([v if v != 0 else -infty for v in transition])
            else:
                next_distr = transition
            prefixsum = np.cumsum(next_distr)
            # Scale the draw by the total mass so rounding in the cumulative
            # sum can never leave the draw above the last entry.
            U = uniform() * prefixsum[-1]
            # First index whose cumulative probability is >= U.
            word_id = int(np.searchsorted(prefixsum, U))
            word = id_to_word[word_id]
            print(word if next_distr[word_id] == 1 else colorize(word), end=" ", flush=True)
    except KeyboardInterrupt:
        print()
        print("I'm dying!")
|
||||
|
||||
def interactive(softmax_enabled = True):
    """Build a text word by word, letting the user choose each next word.

    Starting from word id 0, every round samples five candidates from the
    current word's row of M, lists the distinct ones with their probability,
    and reads an index from standard input. Input that is not an integer or
    is out of range selects the first candidate. The text chosen so far is
    printed after every round.

    Args:
        softmax_enabled: When true, the row is reshaped with softmax() before
            sampling; zero entries are replaced by -infty first.

    The function returns after a KeyboardInterrupt or when standard input is
    exhausted, having printed a farewell message.
    """
    word_id = 0
    text = []
    try:
        while True:
            # Row word_id of M is the successor distribution. Indexing it
            # directly gives the same values as multiplying a one-hot vector
            # with the whole matrix, without touching all N*N entries.
            transition = M[word_id]
            if not transition.any():
                # The current word has no recorded successor. Offer words
                # drawn uniformly at random instead of failing with
                # StopIteration on an all-zero distribution.
                next_distr = np.full(N, 1.0 / N)
            elif softmax_enabled:
                next_distr = softmax([v if v != 0 else -infty for v in transition])
            else:
                next_distr = transition
            prefixsum = np.cumsum(next_distr)
            # Five draws, scaled by the total mass so rounding in the
            # cumulative sum can never leave a draw above the last entry.
            draws = uniform(5) * prefixsum[-1]
            # For each draw, the first index whose cumulative probability is
            # >= the draw. Duplicates are removed via the set.
            choices = list(set(np.searchsorted(prefixsum, draws).tolist()))
            print("Pick from")
            for i, v in enumerate(choices):
                print(f" {i}: {id_to_word[v].ljust(10)} ({next_distr[v]*100:.2f}%)")
            try:
                word_id = choices[int(input())]
            except (ValueError, IndexError):
                word_id = choices[0]
            word = id_to_word[word_id]
            text.append(word)
            print()
            print(" ".join(text))
    except (KeyboardInterrupt, EOFError):
        # EOFError: input() hit end of input (e.g. piped stdin ran out).
        print()
        print("I'm dying!")
|
2
hp/test.txt
Normal file
2
hp/test.txt
Normal file
@ -0,0 +1,2 @@
|
||||
|
||||
hallo ich teste hier ganz lustige leichte worte die man einfach wieder teste kann und worte machen kann und sachen machen ich
|
Loading…
Reference in New Issue
Block a user