test version

adamwulf · Jun 23, 2014 · 67d17ee · 67d17ee
1 parent 051f439
commit 67d17ee
Show file tree

Hide file tree

Showing 7 changed files with 365 additions and 27 deletions.
diff --git a/compute-accuracy-syntax.c b/compute-accuracy-syntax.c
@@ -0,0 +1,262 @@
+//  Copyright 2013 Google Inc. All Rights Reserved.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <malloc.h>
+#include <ctype.h>
+
+const long long max_size = 2000;         // max length of strings
+const long long N = 1;                   // number of closest words
+const long long max_w = 50;              // max length of vocabulary entries
+
+// Scores four-word questions read from stdin against the word vectors stored
+// in the file named by argv[1] and prints per-section and overall top-1
+// accuracy. argv[2], if given, caps how many vocabulary entries are loaded;
+// the mere presence of argv[3] switches to the compact output format.
+// Returns 0 normally (or after the usage text), -1 on open/allocation failure.
+int main(int argc, char **argv)
+{
+	FILE *f;
+	char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch;
+	float dist, len, bestd[N], vec[max_size];
+	long long words, size, a, b, c, d, b1, b2, b3, threshold = 0;
+	float *M; // words x size matrix of vectors, one row per vocabulary entry
+	char *vocab; // words slots of max_w chars each, one word per slot
+	int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;
+	int small = 0; // nonzero selects the compact output format
+	// NOTE(review): SECN and SEAC are declared but never used below.
+	if (argc < 2) {
+		printf("Usage: ./compute-accuracy <FILE> <threshold> <small>\nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n");
+		return 0;
+	}
+	// NOTE(review): strcpy is unbounded; file_name holds max_size chars.
+	strcpy(file_name, argv[1]);
+
+	if (argc > 2)
+		threshold = atoi(argv[2]);
+	// Any third argument enables compact output; its value is not inspected.
+	if (argc > 3)
+		small = 1;
+
+	f = fopen(file_name, "rb");
+
+	if (f == NULL) {
+		printf("Input file not found\n");
+		return -1;
+	}
+	// The file starts with two integers: the word count and the per-word vector length.
+	fscanf(f, "%lld", &words);
+
+	if (threshold)
+		if (words > threshold)
+			words = threshold;
+
+	fscanf(f, "%lld", &size);
+	// NOTE(review): the vocab allocation is not NULL-checked; only M is.
+	vocab = (char *)malloc(words * max_w * sizeof(char));
+
+	M = (float *)malloc(words * size * sizeof(float));
+
+	if (M == NULL) {
+		printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576);
+		return -1;
+	}
+	// Load each entry: the word, then size raw floats, scaled to unit length.
+	for (b = 0; b < words; b++) {
+		// Reads the word plus the single character that follows it; no field width is given.
+		fscanf(f, "%s%c", &vocab[b * max_w], &ch);
+		// Upper-cases the entire max_w slot, including bytes past the terminator.
+		for (a = 0; a < max_w; a++)
+			vocab[b * max_w + a] = toupper(vocab[b * max_w + a]);
+
+		for (a = 0; a < size; a++)
+			fread(&M[a + b * size], sizeof(float), 1, f);
+		// Euclidean norm of the row just read.
+		len = 0;
+
+		for (a = 0; a < size; a++)
+			len += M[a + b * size] * M[a + b * size];
+
+		len = sqrt(len);
+
+		for (a = 0; a < size; a++)
+			M[a + b * size] /= len;
+	}
+
+	fclose(f);
+
+	TCN = 0;
+	// Each iteration consumes either a section marker or one four-word question.
+	while (1) {
+
+		for (a = 0; a < N; a++)
+			bestd[a] = 0;
+
+		for (a = 0; a < N; a++)
+			bestw[a][0] = 0;
+
+		scanf("%s", st1);
+
+		for (a = 0; a < strlen(st1); a++)
+			st1[a] = toupper(st1[a]);
+		// ":", "EXIT" or end of input closes the current section.
+		if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) {
+			// Avoids dividing by zero when the section had no scored questions.
+			if (TCN == 0)
+			TCN = 1;
+			// QID is 0 only before the first section, when there is nothing to report.
+			if (QID != 0){ 
+				if(small ==0)
+					printf("%.2f %%  (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
+				else
+					printf("%.2f\t", CCN / (float)TCN * 100);
+			}
+		// The token after the marker is taken as the next section's name.
+		QID++;
+		scanf("%s", st1);
+
+		if (feof(stdin))
+			break;
+
+		if(small==0)
+			printf("%s\t", st1);
+
+		TCN = 0;
+		CCN = 0;
+
+		continue;
+
+		}
+		// NOTE(review): "EXIT" is already handled by the branch above, so this test looks unreachable.
+		if (!strcmp(st1, "EXIT"))
+			break;
+
+		scanf("%s", st2);
+
+		for (a = 0; a < strlen(st2); a++)
+			st2[a] = toupper(st2[a]);
+
+		scanf("%s", st3);
+
+		for (a = 0; a<strlen(st3); a++)
+			st3[a] = toupper(st3[a]);
+
+		scanf("%s", st4);
+
+		for (a = 0; a < strlen(st4); a++)
+			st4[a] = toupper(st4[a]);
+		// Linear vocabulary lookups; an index equal to words means "not found".
+		for (b = 0; b < words; b++)
+			if (!strcmp(&vocab[b * max_w], st1))
+				break;
+
+		b1 = b;
+
+		for (b = 0; b < words; b++)
+			if (!strcmp(&vocab[b * max_w], st2))
+				break;
+
+		b2 = b;
+
+		for (b = 0; b < words; b++)
+			if (!strcmp(&vocab[b * max_w], st3))
+				break;
+
+		b3 = b;
+
+		for (a = 0; a < N; a++)
+			bestd[a] = 0;
+
+		for (a = 0; a < N; a++)
+			bestw[a][0] = 0;
+		// TQ counts every question read; TQS only those whose four words are all in the vocabulary.
+		TQ++;
+
+		if (b1 == words)
+			continue;
+
+		if (b2 == words)
+			continue;
+
+		if (b3 == words)
+			continue;
+
+		for (b = 0; b < words; b++) 
+			if (!strcmp(&vocab[b * max_w], st4))
+				break;
+
+		if (b == words)
+			continue;
+		// Query vector: st2 - st1 + st3. NOTE(review): vec holds max_size floats and size is not checked against that.
+		for (a = 0; a < size; a++)
+			vec[a] = (M[a + b2 * size] - M[a + b1 * size]) + M[a + b3 * size];
+
+		TQS++;
+		// Rank every other word by dot product with the query, keeping the N best.
+		for (c = 0; c < words; c++) {
+			// The three question words themselves are excluded from the candidates.
+			if (c == b1)
+				continue;
+
+			if (c == b2)
+				continue;
+
+			if (c == b3)
+				continue;
+
+			dist = 0;
+
+			for (a = 0; a < size; a++)
+				dist += vec[a] * M[a + c * size];
+			// Insert into the descending top-N list, shifting lower entries down.
+			for (a = 0; a < N; a++) {
+
+				if (dist > bestd[a]) {
+
+					for (d = N - 1; d > a; d--) {
+						bestd[d] = bestd[d - 1];
+						strcpy(bestw[d], bestw[d - 1]);
+					}
+
+					bestd[a] = dist;
+					strcpy(bestw[a], &vocab[c * max_w]);
+					break;
+				}
+			}
+		}
+		// Correct when the best candidate is exactly the expected fourth word.
+		if (!strcmp(st4, bestw[0])) {
+			CCN++;
+			CACN++;
+			SYAC++;
+		}	
+
+		SYCN++;
+		TCN++;
+		TACN++;
+
+	}
+
+	if(small == 0){
+		printf("Total accuracy: %.2f %%\n", CACN / (float)TACN * 100);
+		printf("Questions seen / total: %d %d   %.2f %% \n", TQS, TQ, TQS/(float)TQ*100);
+	}else{
+		printf("%.2f\n",CACN / (float)TACN * 100);
+	}
+	return 0;
+
+}
diff --git a/compute-accuracy.c b/compute-accuracy.c
@@ -26,15 +26,15 @@ const long long max_w = 50;              // max length of vocabulary entries
 int main(int argc, char **argv)
 {
   FILE *f;
-  char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch;
+  char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size],output_file[max_size], ch;
   float dist, len, bestd[N], vec[max_size];
   long long words, size, a, b, c, d, b1, b2, b3, threshold = 0;
   float *M;
   char *vocab;
   int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;
-
+  int small_print =0;
   if (argc < 2) {
-    printf("Usage: ./compute-accuracy <FILE> <threshold>\nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n");
+    printf("Usage: ./compute-accuracy <FILE> <threshold> <small_print>\nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n");
     return 0;
   }
 
@@ -43,6 +43,9 @@ int main(int argc, char **argv)
   if (argc > 2)
     threshold = atoi(argv[2]);
 
+  if (argc > 3)
+    small_print = 1; //output is smaller
+
   f = fopen(file_name, "rb");
 
   if (f == NULL) {
@@ -91,7 +94,8 @@ int main(int argc, char **argv)
   fclose(f);
 
   TCN = 0;
-
+  if(small_print)
+    printf("Type\tAccuracy(top1)%%\tTotal Acc%%\tSemantic Acc%%\tSyntactic Acc%%\tSuccess\tTotal\n");
   while (1) {
 
     for (a = 0; a < N; a++)
@@ -111,8 +115,12 @@ int main(int argc, char **argv)
         TCN = 1;
 
       if (QID != 0) {
-        printf("ACCURACY TOP1: %.2f %%  (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
-        printf("Total accuracy: %.2f %%   Semantic accuracy: %.2f %%   Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100);
+        if(small_print){
+          printf("%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\n", CCN / (float)TCN * 100,CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100, CCN, TCN);
+        }else{
+          printf("ACCURACY TOP1: %.2f %%  (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
+          printf("Total accuracy: %.2f %%   Semantic accuracy: %.2f %%   Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100);
+        }
       }
 
       QID++;
@@ -121,7 +129,11 @@ int main(int argc, char **argv)
       if (feof(stdin))
         break;
 
-      printf("%s:\n", st1);
+      if(small_print)
+        printf("%s\t", st1);
+      else
+        printf("%s:\n", st1);
+
       TCN = 0;
       CCN = 0;
       continue;

diff --git a/demo-word-accuracy.sh b/demo-word-accuracy.sh
@@ -3,6 +3,6 @@ if [ ! -e text8 ]; then
   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
   gzip -d text8.gz -f
 fi
-time ./word2vec -train text8 -output /tmp/vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 0 -threads 12 -binary 1 -ngram 8 -hashbang 1 -min-count 0 
+time ./word2vec -train text8 -output /tmp/vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 0 -threads 12 -binary 1
 ./compute-accuracy /tmp/vectors.bin 30000 < questions-words.txt
 # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
diff --git a/makefile b/makefile
@@ -1,9 +1,9 @@
 CC = gcc
 #The -Ofast might not work with older versions of gcc; in that case, use -O2
-CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result -g
+CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result
 #
 
-all: word2vec word2phrase distance word-analogy compute-accuracy
+all: word2vec word2phrase distance word-analogy compute-accuracy compute-accuracy-syntax 
 
 word2vec : word2vec.c
 	$(CC) word2vec.c -o word2vec $(CFLAGS)
@@ -15,7 +15,9 @@ word-analogy : word-analogy.c
 	$(CC) word-analogy.c -o word-analogy $(CFLAGS)
 compute-accuracy : compute-accuracy.c
 	$(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS)
+compute-accuracy-syntax : compute-accuracy-syntax.c
+	$(CC) compute-accuracy-syntax.c -o compute-accuracy-syntax $(CFLAGS)
 	chmod +x *.sh
 
 clean:
-	rm -rf word2vec word2phrase distance word-analogy compute-accuracy
+	rm -rf word2vec word2phrase distance word-analogy compute-accuracy compute-accuracy-syntax
diff --git a/test-ngram-w2vec.py b/test-ngram-w2vec.py
@@ -0,0 +1,36 @@
+#test-ngram-w2vec.py: runs ./testNgrams.sh once per parameter combination, writing results.txt and parameters.txt
+import subprocess as sp
+import numpy as np
+# Value grids swept by the nested loops below (the print statement is Python 2 syntax).
+sizes = range(200,350,50)
+samples = ["0","1e-5"]
+negatives = range(0,10,5)
+alphas = np.arange(0.025,0.060,0.015)
+ngrams = range(2,5,1)
+hashbs = [0,1]
+cbows = [0,1]
+hsE = [0,1]
+# NOTE(review): logFile and lofFile2 are never closed explicitly in this script.
+
+cpt = 1  # 1-based counter of launched runs
+logFile = open("results.txt" , "w")  # receives the stdout of every ./testNgrams.sh run
+lofFile2 = open("parameters.txt", "w")  # one tab-separated row of parameters per run
+lofFile2.write("size\tsample\tnegative\talpha\tngram\thashbang\tcbow\ths\n");
+for size in sizes:
+	for sample in samples:
+		for negative in negatives:
+			for hs in hsE:
+				if negative == 0 and hs == 0:
+					continue;  # skip combinations where both negative and hs are 0
+				for alpha in alphas:
+					for ngram in ngrams:
+						for hashb in hashbs:
+							for cbow in cbows:
+									print "iteration %d on 649" % (cpt)  # NOTE(review): 649 is hard-coded; the grids above appear to yield 648 runs - confirm
+									argsLine= "./testNgrams.sh %s %s %s %s %s %s %s %s" % (str(size),str(sample),str(negative),str(alpha),str(ngram),str(hashb),str(cbow),str(hs))
+									argu= "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (str(size),str(sample),str(negative),str(alpha),str(ngram),str(hashb),str(cbow),str(hs))
+									lofFile2.write(argu);
+									sp.call(args=argsLine,shell=True,stdout=logFile)  # command string is run through the shell; the call's return value is ignored
+									cpt = cpt+1
+
+