Implement HP

2023-06-06 14:09:08 +02:00 · 2023-06-06 14:09:08 +02:00 · 6ae93e74f5
commit 6ae93e74f5
parent 5f5e96c0cf
3 changed files with 8022 additions and 0 deletions
--- a/hp/hp.py
+++ b/hp/hp.py
@@ -0,0 +1,123 @@
+#!/bin/python3
+
+def colorize(word):
+    """Return *word* wrapped in ANSI escape codes for yellow text.
+
+    The prefix selects SGR attributes ``0;33`` (reset, then yellow
+    foreground) and the suffix resets all attributes again, so text printed
+    afterwards is unaffected.
+    """
+    highlighted = f"\u001b[0;33m{word}\u001b[0m"
+    return highlighted
+
+def softmax(distr, temperature=1.5):
+    """Return the temperature-scaled softmax of *distr* as a new NumPy array.
+
+    Each score ``x`` is weighted by ``exp(x / temperature)`` and the weights
+    are normalised to sum to 1. A higher temperature flattens the result,
+    a lower one sharpens it.
+
+    Args:
+        distr: Sequence (list or array) of real-valued scores. It is not
+            modified; the result is always a freshly allocated array.
+        temperature: Positive scaling factor applied before exponentiation.
+
+    Returns:
+        A float ``numpy.ndarray`` of the same length as *distr*. An empty
+        input yields an empty array.
+
+    Raises:
+        ValueError: If *temperature* is not strictly positive.
+
+    Note:
+        ``np`` is looked up when the function is called, so the module-level
+        ``import numpy as np`` further down the file is sufficient.
+    """
+    if temperature <= 0:
+        raise ValueError("temperature must be positive")
+    scores = np.asarray(distr, dtype=float) / temperature
+    if scores.size == 0:
+        return scores
+    # Softmax is invariant under a constant shift; subtracting the maximum
+    # keeps exp() from overflowing on large scores. As a consequence, a
+    # vector of identical scores (even hugely negative ones) maps to the
+    # uniform distribution instead of 0/0 = NaN.
+    weights = np.exp(scores - scores.max())
+    return weights / weights.sum()
+import re
+# A line beginning with "CHAPTER" is a chapter heading.
+chapter_pattern = re.compile("^CHAPTER")
+# A line consisting solely of digits is treated as a page number. The raw
+# string makes sure `\d` reaches the regex engine unchanged instead of being
+# parsed as an (invalid) string escape sequence.
+page_pattern = re.compile(r"^\d+$")
+
+with open("hp.txt") as f:
+    lines = f.readlines()
+
+print("Stripping excess data")
+# Collect the text lines, dropping chapter headings (plus the line that
+# follows each heading) and bare page numbers.
+truelines = []
+# The very first line of the file is never examined.
+# NOTE(review): presumably a title/header line -- confirm against hp.txt.
+remaining = iter(lines[1:])
+for raw_line in remaining:
+    line = raw_line.strip()
+    if chapter_pattern.match(line):
+        # ditch one extra line (the one right after the heading)
+        next(remaining, None)
+    elif not page_pattern.match(line):
+        truelines.append(line)
+
+print("Parsing lines")
+# Glue all kept lines into one string and cut it at single spaces. Empty
+# lines and doubled spaces therefore produce empty-string tokens, which are
+# treated like any other word.
+corpus = " ".join(truelines)
+tokens = corpus.split(" ")
+words = set(tokens)
+# Two-way vocabulary mapping. Both directions come from one enumeration of
+# the set, so they are guaranteed to be inverses of each other.
+id_to_word = {idx: word for idx, word in enumerate(words)}
+word_to_id = {word: idx for idx, word in id_to_word.items()}
+
+# Vocabulary size; also the side length of the bigram matrices.
+N = len(words)
+from numpy import matrix as M, array
+#m = M( [ [ 0 for _ in range(len(words)) ] for _ in range(len(words)) ] )
+
+print("allocating array")
+from numpy import zeros
+# m[a, b] counts how often word b directly follows word a (integer counts).
+m = zeros((N, N), dtype=int)
+# M will hold the row-normalised transition probabilities (floats). This
+# rebinds the name M, which was imported above as numpy.matrix.
+M = zeros((N, N))
+
+print("processing bigrams")
+# Pair every token with its successor and count each pair in m.
+bigrams = zip(tokens, tokens[1:])
+for first, second in bigrams:
+    m[word_to_id[first], word_to_id[second]] += 1
+
+print("normalizing matrix")
+# Turn each row of counts in m into a probability distribution in M.
+for i in range(N):
+    row = m[i]
+    # ndarray.sum() adds the row up in native code; the builtin sum() would
+    # iterate over all N entries in Python for each of the N rows.
+    row_total = row.sum()
+    if row_total == 0:
+        # No successor was ever observed for this word: its row in M stays
+        # all zeros.
+        continue
+    M[i] = row * (1.0 / row_total)
+
+print("Done preparing")
+
+# randomwalker
+from time import sleep
+from numpy.random import random as uniform
+import numpy as np
+#word = id_to_word[0]
+
+# Stand-in for "minus infinity" when masking impossible successors: with a
+# score of -infty the exponential inside softmax underflows to exactly 0.
+infty = 1_000_000
+def simulate(softmax_enabled=True):
+    """Print an endless random walk over the bigram model until Ctrl-C.
+
+    Starts at word id 0 and repeatedly samples the next word from the
+    current word's row of the module-level transition matrix ``M``. A word
+    that was the only possible successor is printed plainly; every other
+    word is printed via ``colorize``.
+
+    Args:
+        softmax_enabled: If true, the observed successors' probabilities are
+            passed through ``softmax`` before sampling; successors that were
+            never observed are masked with ``-infty`` so they stay at
+            probability 0.
+
+    Relies on the module-level names ``M``, ``N``, ``id_to_word``, ``np``,
+    ``softmax`` and ``colorize``.
+    """
+    word_id = 0
+    try:
+        while True:
+            # Multiplying a one-hot vector with M just selects one row, so
+            # read that row directly instead of doing an O(N^2) product.
+            next_distr = M[word_id]
+            if not next_distr.any():
+                # Dead end: this word has no observed successor. Restart at
+                # a uniformly random word instead of failing to sample.
+                next_distr = np.full(N, 1.0 / N)
+            elif softmax_enabled:
+                next_distr = softmax(np.where(next_distr != 0, next_distr, -infty))
+            # Sample by probability; never yields a zero-probability word
+            # and cannot run off the end of the cumulative sum.
+            word_id = int(np.random.choice(N, p=next_distr))
+            word = id_to_word[word_id]
+            print(word if next_distr[word_id] == 1 else colorize(word), end=" ", flush=True)
+    except KeyboardInterrupt:
+        print()
+        print("I'm dying!")
+
+def interactive(softmax_enabled=True):
+    """Build a text word by word, letting the user pick each next word.
+
+    Starting at word id 0, each round draws five samples from the current
+    word's successor distribution, shows the distinct candidates with their
+    probabilities, and reads the index of the chosen one from stdin.
+    Input that is not a valid index selects candidate 0. The text so far is
+    printed after every choice. Ctrl-C or end of input ends the session.
+
+    Args:
+        softmax_enabled: If true, the observed successors' probabilities are
+            passed through ``softmax`` before sampling; successors that were
+            never observed are masked with ``-infty`` so they stay at
+            probability 0.
+
+    Relies on the module-level names ``M``, ``N``, ``id_to_word``, ``np``,
+    ``infty`` and ``softmax``.
+    """
+    word_id = 0
+    text = []
+    try:
+        while True:
+            # Multiplying a one-hot vector with M just selects one row, so
+            # read that row directly instead of doing an O(N^2) product.
+            next_distr = M[word_id]
+            if not next_distr.any():
+                # Dead end: this word has no observed successor. Offer
+                # uniformly random words instead of failing to sample.
+                next_distr = np.full(N, 1.0 / N)
+            elif softmax_enabled:
+                next_distr = softmax(np.where(next_distr != 0, next_distr, -infty))
+            # Five draws by probability; duplicates collapse, so between one
+            # and five distinct candidates are offered.
+            samples = np.random.choice(N, size=5, p=next_distr)
+            choices = sorted({int(sample) for sample in samples})
+            print("Pick from")
+            for i, v in enumerate(choices):
+                print(f"    {i}: {id_to_word[v].ljust(10)} ({next_distr[v]*100:.2f}%)")
+            try:
+                word_id = choices[int(input())]
+            except (ValueError, IndexError):
+                # Not a number or out of range: fall back to the first entry.
+                word_id = choices[0]
+            word = id_to_word[word_id]
+            text.append(word)
+            print()
+            print(" ".join(text))
+    except (KeyboardInterrupt, EOFError):
+        # EOFError: stdin was closed (e.g. Ctrl-D); treat it like Ctrl-C.
+        print()
+        print("I'm dying!")
--- a/hp/hp.txt
+++ b/hp/hp.txt
--- a/hp/test.txt
+++ b/hp/test.txt
@@ -0,0 +1,2 @@
+
+hallo ich teste hier ganz lustige leichte worte die man einfach wieder teste kann und worte machen kann und sachen machen ich
				`@ -0,0 +1,2 @@`

				`hallo ich teste hier ganz lustige leichte worte die man einfach wieder teste kann und worte machen kann und sachen machen ich`