From 67d17eeecdf71e7694263e061d796a079a7a4a3a Mon Sep 17 00:00:00 2001
From: Charles-Emmanuel Dias
Date: Mon, 23 Jun 2014 16:08:18 +0200
Subject: [PATCH] test version

---
 compute-accuracy-syntax.c | 262 ++++++++++++++++++++++++++++++++++++++
 compute-accuracy.c        |  26 +++-
 demo-word-accuracy.sh     |   2 +-
 makefile                  |   8 +-
 test-ngram-w2vec.py       |  36 ++++++
 testNgrams.sh             |  16 +++
 word2vec.c                |  42 +++---
 7 files changed, 365 insertions(+), 27 deletions(-)
 create mode 100644 compute-accuracy-syntax.c
 create mode 100644 test-ngram-w2vec.py
 create mode 100755 testNgrams.sh

diff --git a/compute-accuracy-syntax.c b/compute-accuracy-syntax.c
new file mode 100644
index 0000000..e1fbd47
--- /dev/null
+++ b/compute-accuracy-syntax.c
@@ -0,0 +1,262 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <malloc.h>
+#include <ctype.h>
+
+const long long max_size = 2000;  // max length of strings
+const long long N = 1;            // number of closest words
+const long long max_w = 50;       // max length of vocabulary entries
+
+int main(int argc, char **argv)
+{
+  FILE *f;
+  char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch;
+  float dist, len, bestd[N], vec[max_size];
+  long long words, size, a, b, c, d, b1, b2, b3, threshold = 0;
+  float *M;
+  char *vocab;
+  int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;
+  int small = 0;
+
+  if (argc < 2) {
+    printf("Usage: ./compute-accuracy-syntax <FILE> <threshold> <small>\nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n");
+    return 0;
+  }
+
+  strcpy(file_name, argv[1]);
+
+  if (argc > 2)
+    threshold = atoi(argv[2]);
+
+  if (argc > 3)
+    small = 1;
+
+  f = fopen(file_name, "rb");
+
+  if (f == NULL) {
+    printf("Input file not found\n");
+    return -1;
+  }
+
+  fscanf(f, "%lld", &words);
+
+  if (threshold)
+    if (words > threshold)
+      words = threshold;
+
+  fscanf(f, "%lld", &size);
+
+  vocab = (char *)malloc(words * max_w * sizeof(char));
+
+  M = (float *)malloc(words * size * sizeof(float));
+
+  if (M == NULL) {
+    printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576);
+    return -1;
+  }
+
+  for (b = 0; b < words; b++) {
+
+    fscanf(f, "%s%c", &vocab[b * max_w], &ch);
+
+    for (a = 0; a < max_w; a++)
+      vocab[b * max_w + a] = toupper(vocab[b * max_w + a]);
+
+    for (a = 0; a < size; a++)
+      fread(&M[a + b * size], sizeof(float), 1, f);
+
+    len = 0;
+
+    for (a = 0; a < size; a++)
+      len += M[a + b * size] * M[a + b * size];
+
+    len = sqrt(len);
+
+    for (a = 0; a < size; a++)
+      M[a + b * size] /= len;
+  }
+
+  fclose(f);
+
+  TCN = 0;
+
+  while (1) {
+
+    for (a = 0; a < N; a++)
+      bestd[a] = 0;
+
+    for (a = 0; a < N; a++)
+      bestw[a][0] = 0;
+
+    scanf("%s", st1);
+
+    for (a = 0; a < strlen(st1); a++)
+      st1[a] = toupper(st1[a]);
+
+    if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) {
+
+      if (TCN == 0)
+        TCN = 1;
+
+      if (QID != 0) {
+        if (small == 0)
+          printf("%.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
+        else
+          printf("%.2f\t", CCN / (float)TCN * 100);
+      }
+
+      QID++;
+      scanf("%s", st1);
+
+      if (feof(stdin))
+        break;
+
+      if (small == 0)
+        printf("%s\t", st1);
+
+      TCN = 0;
+      CCN = 0;
+
+      continue;
+    }
+
+    if (!strcmp(st1, "EXIT"))
+      break;
+
+    scanf("%s", st2);
+
+    for (a = 0; a < strlen(st2); a++)
+      st2[a] = toupper(st2[a]);
+
+    scanf("%s", st3);
+
+    for (a = 0; a < strlen(st3); a++)
+      st3[a] = toupper(st3[a]);
+
+    scanf("%s", st4);
+
+    for (a = 0; a < strlen(st4); a++)
+      st4[a] = toupper(st4[a]);
+
+    for (b = 0; b < words; b++)
+      if (!strcmp(&vocab[b * max_w], st1)) break;
+    b1 = b;
+
+    for (b = 0; b < words; b++)
+      if (!strcmp(&vocab[b * max_w], st2)) break;
+    b2 = b;
+
+    for (b = 0; b < words; b++)
+      if (!strcmp(&vocab[b * max_w], st3)) break;
+    b3 = b;
+
+    TQ++;
+
+    if (b1 == words) continue;
+    if (b2 == words) continue;
+    if (b3 == words) continue;
+
+    for (b = 0; b < words; b++)
+      if (!strcmp(&vocab[b * max_w], st4)) break;
+
+    if (b == words) continue;
+
+    for (a = 0; a < size; a++)
+      vec[a] = (M[a + b2 * size] - M[a + b1 * size]) + M[a + b3 * size];
+
+    TQS++;
+
+    for (c = 0; c < words; c++) {
+      if (c == b1) continue;
+      if (c == b2) continue;
+      if (c == b3) continue;
+
+      dist = 0;
+
+      for (a = 0; a < size; a++)
+        dist += vec[a] * M[a + c * size];
+
+      for (a = 0; a < N; a++) {
+        if (dist > bestd[a]) {
+
+          for (d = N - 1; d > a; d--) {
+            bestd[d] = bestd[d - 1];
+            strcpy(bestw[d], bestw[d - 1]);
+          }
+
+          bestd[a] = dist;
+          strcpy(bestw[a], &vocab[c * max_w]);
+          break;
+        }
+      }
+    }
+
+    if (!strcmp(st4, bestw[0])) {
+      CCN++;
+      CACN++;
+      SYAC++;
+    }
+
+    SYCN++;
+    TCN++;
+    TACN++;
+  }
+
+  if (small == 0) {
+    printf("Total accuracy: %.2f %%\n", CACN / (float)TACN * 100);
+    printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, TQS / (float)TQ * 100);
+  } else {
+    printf("%.2f\n", CACN / (float)TACN * 100);
+  }
+  return 0;
+}
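For reference, the loop above is the standard word2vec vector-offset analogy test: for each question a:b :: c:d it forms M[b] - M[a] + M[c] and checks whether the nearest remaining vocabulary entry (dot product, i.e. cosine on the unit-normalised rows) is d. A minimal numpy sketch of the same computation; the names vocab, M and questions are illustrative, not part of the patch:

    import numpy as np

    def analogy_accuracy(vocab, M, questions):
        # vocab: list of upper-cased words; M: (len(vocab), size) array whose
        # rows are unit-normalised, mirroring what the C code loads and rescales.
        index = {w: i for i, w in enumerate(vocab)}
        correct = total = 0
        for a, b, c, d in questions:
            if any(w not in index for w in (a, b, c, d)):
                continue  # like the C code, skip questions with OOV words
            target = M[index[b]] - M[index[a]] + M[index[c]]
            dists = M.dot(target)        # dot product == cosine on unit rows
            for w in (a, b, c):          # question words are never predicted
                dists[index[w]] = -np.inf
            total += 1
            correct += int(vocab[int(np.argmax(dists))] == d)
        return 100.0 * correct / total if total else 0.0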
diff --git a/compute-accuracy.c b/compute-accuracy.c
index a1166c6..193fa18 100644
--- a/compute-accuracy.c
+++ b/compute-accuracy.c
@@ -26,15 +26,15 @@ const long long max_w = 50;  // max length of vocabulary entries
 int main(int argc, char **argv)
 {
   FILE *f;
-  char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch;
+  char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], output_file[max_size], ch;
   float dist, len, bestd[N], vec[max_size];
   long long words, size, a, b, c, d, b1, b2, b3, threshold = 0;
   float *M;
   char *vocab;
   int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;
-
+  int small_print = 0;
   if (argc < 2) {
-    printf("Usage: ./compute-accuracy <FILE> <threshold>\nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n");
+    printf("Usage: ./compute-accuracy <FILE> <threshold> <small>\nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n");
     return 0;
   }
@@ -43,6 +43,9 @@ int main(int argc, char **argv)
   if (argc > 2)
     threshold = atoi(argv[2]);
 
+  if (argc > 3)
+    small_print = 1; // output is smaller
+
   f = fopen(file_name, "rb");
 
   if (f == NULL) {
@@ -91,7 +94,8 @@ int main(int argc, char **argv)
   fclose(f);
 
   TCN = 0;
-
+  if (small_print)
+    printf("Type\tAccuracy(top1)%%\tTotal Acc%%\tSemantic Acc%%\tSyntactic Acc%%\tSuccess\tTotal\n");
   while (1) {
 
     for (a = 0; a < N; a++)
@@ -111,8 +115,12 @@ int main(int argc, char **argv)
       TCN = 1;
 
     if (QID != 0) {
-      printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
-      printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100);
+      if (small_print) {
+        printf("%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\n", CCN / (float)TCN * 100, CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100, CCN, TCN);
+      } else {
+        printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
+        printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100);
+      }
     }
 
     QID++;
@@ -121,7 +129,11 @@ int main(int argc, char **argv)
     if (feof(stdin))
       break;
-    printf("%s:\n", st1);
+    if (small_print)
+      printf("%s\t", st1);
+    else
+      printf("%s:\n", st1);
+
     TCN = 0;
     CCN = 0;
     continue;
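With the new third argument set, the tool emits one tab-separated row per question category under the header added above, which makes the output machine-readable. A sketch of consuming it; the file name accuracy-small.tsv is illustrative:

    import csv

    with open("accuracy-small.tsv") as fh:
        for row in csv.DictReader(fh, delimiter="\t"):
            print row["Type"], row["Accuracy(top1)%"], row["Total Acc%"]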
diff --git a/demo-word-accuracy.sh b/demo-word-accuracy.sh
index 1388fde..8ec7622 100755
--- a/demo-word-accuracy.sh
+++ b/demo-word-accuracy.sh
@@ -3,6 +3,6 @@ if [ ! -e text8 ]; then
   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
   gzip -d text8.gz -f
 fi
-time ./word2vec -train text8 -output /tmp/vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 0 -threads 12 -binary 1 -ngram 8 -hashbang 1 -min-count 0
+time ./word2vec -train text8 -output /tmp/vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 0 -threads 12 -binary 1
 ./compute-accuracy /tmp/vectors.bin 30000 < questions-words.txt
 # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
diff --git a/makefile b/makefile
index 23c95e7..da67021 100644
--- a/makefile
+++ b/makefile
@@ -1,9 +1,9 @@
 CC = gcc
 #The -Ofast might not work with older versions of gcc; in that case, use -O2
-CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result -g
+CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result #
 
-all: word2vec word2phrase distance word-analogy compute-accuracy
+all: word2vec word2phrase distance word-analogy compute-accuracy compute-accuracy-syntax
 
 word2vec : word2vec.c
 	$(CC) word2vec.c -o word2vec $(CFLAGS)
@@ -15,7 +15,9 @@ word-analogy : word-analogy.c
 	$(CC) word-analogy.c -o word-analogy $(CFLAGS)
 compute-accuracy : compute-accuracy.c
 	$(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS)
+compute-accuracy-syntax : compute-accuracy-syntax.c
+	$(CC) compute-accuracy-syntax.c -o compute-accuracy-syntax $(CFLAGS)
 	chmod +x *.sh
 
 clean:
-	rm -rf word2vec word2phrase distance word-analogy compute-accuracy
\ No newline at end of file
+	rm -rf word2vec word2phrase distance word-analogy compute-accuracy compute-accuracy-syntax
\ No newline at end of file
diff --git a/test-ngram-w2vec.py b/test-ngram-w2vec.py
new file mode 100644
index 0000000..85d213f
--- /dev/null
+++ b/test-ngram-w2vec.py
@@ -0,0 +1,36 @@
+# test-ngram-w2vec.py
+import subprocess as sp
+import numpy as np
+
+sizes = range(200, 350, 50)
+samples = ["0", "1e-5"]
+negatives = range(0, 10, 5)
+alphas = np.arange(0.025, 0.060, 0.015)
+ngrams = range(2, 5, 1)
+hashbs = [0, 1]
+cbows = [0, 1]
+hsE = [0, 1]
+
+cpt = 1
+logFile = open("results.txt", "w")
+logFile2 = open("parameters.txt", "w")
+logFile2.write("size\tsample\tnegative\talpha\tngram\thashbang\tcbow\ths\n")
+for size in sizes:
+    for sample in samples:
+        for negative in negatives:
+            for hs in hsE:
+                if negative == 0 and hs == 0:
+                    continue
+                for alpha in alphas:
+                    for ngram in ngrams:
+                        for hashb in hashbs:
+                            for cbow in cbows:
+                                print "iteration %d of 648" % (cpt)
+                                argsLine = "./testNgrams.sh %s %s %s %s %s %s %s %s" % (str(size), str(sample), str(negative), str(alpha), str(ngram), str(hashb), str(cbow), str(hs))
+                                argu = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (str(size), str(sample), str(negative), str(alpha), str(ngram), str(hashb), str(cbow), str(hs))
+                                logFile2.write(argu)
+                                sp.call(args=argsLine, shell=True, stdout=logFile)
+                                cpt = cpt + 1
diff --git a/testNgrams.sh b/testNgrams.sh
new file mode 100755
index 0000000..cb3bed7
--- /dev/null
+++ b/testNgrams.sh
@@ -0,0 +1,16 @@
+if [ "$#" -ne 8 ]; then
+  echo "Illegal number of parameters"
+  echo "Usage: testNgrams.sh size sample neg alpha ngram hashbang cbow hs"
+  exit 1
+fi
+
+p_size=$1
+p_sample=$2
+p_neg=$3
+p_alpha=$4
+p_ngram=$5
+p_hashb=$6
+p_cbow=$7
+p_hs=$8
+
+./word2vec -train text8 -output /tmp/vectors.bin -debug 0 -min-count 0 -window 5 -threads 12 -binary 1 -cbow $p_cbow -size $p_size -negative $p_neg -hs $p_hs -sample $p_sample -ngram $p_ngram -hashbang $p_hashb -alpha $p_alpha
+./compute-accuracy-syntax /tmp/vectors.bin 10000 2 < questions-words-syntax.txt
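test-ngram-w2vec.py writes one line per run to parameters.txt and redirects each testNgrams.sh run's stdout to results.txt, so after the sweep the two files should line up row for row. A sketch of joining them back together, assuming every run produced exactly one output line (compute-accuracy-syntax in small mode prints a single tab-separated line, and -debug 0 keeps word2vec quiet):

    with open("parameters.txt") as p, open("results.txt") as r:
        print p.readline().rstrip("\n") + "\taccuracy"
        for params, result in zip(p, r):
            print params.rstrip("\n") + "\t" + result.rstrip("\n")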
diff --git a/word2vec.c b/word2vec.c
index ad921dd..3b70592 100644
--- a/word2vec.c
+++ b/word2vec.c
@@ -109,7 +109,6 @@ void ReadWord(char *word, FILE *fin) {
     if (character == '\n') {
       strcpy(word, (char *)"</s>"); //newline becomes </s> in corpus
-      printf("READ newline\n");
       return;
     }
     else
@@ -299,8 +298,6 @@ void SortVocab() {
     vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
   }
-
-  printf("Sorting ended !\n");
 }
 
 // Reduces the vocabulary by removing infrequent tokens
@@ -467,13 +464,18 @@ void LearnVocabFromTrainFile() {
     if (lenWord <= ngram) { //word smaller or equal to ngram var.
       searchAndAddToVocab(word);
-      continue;
+      //printf("smaller\n");
+
+      if (feof(fin))
+        break;
+      else
+        continue;
     }
 
     start = 0;
     end = ngram - 1;
     i = 0;
-
+    //printf("%s\n", word);
     while (end < lenWord) {
[...]
-  if (debug_mode > 0) {
+  if (debug_mode > 1) {
     printf("Vocab size: %lld\n", vocab_size);
     printf("Words in train file: %lld\n", train_words);
   }
@@ -560,7 +562,7 @@ void ReadVocab() {
   SortVocab();
 
-  if (debug_mode > 0) {
+  if (debug_mode > 1) {
     printf("Vocab size: %lld\n", vocab_size);
     printf("Words in train file: %lld\n", train_words);
   }
@@ -968,7 +970,10 @@ void TrainModel() {
   long a, b, c, d;
   FILE *fo;
   pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
-  printf("Starting training using file %s\n", train_file);
+
+  if (debug_mode > 0)
+    printf("Starting training using file %s\n", train_file);
+
   starting_alpha = alpha;
   if (read_vocab_file[0] != 0)
@@ -995,7 +1000,8 @@ void TrainModel() {
   for (a = 0; a < num_threads; a++)
     pthread_join(pt[a], NULL);
-  printf("Training Ended !\n");
+  if (debug_mode > 0)
+    printf("Training Ended !\n");
 
   if (ngram > 0)
     return;
@@ -1139,7 +1145,9 @@ void createWordVectorFile() {
   }
 
   fprintf(fo, "%lld %lld\n", cptWord, layer1_size); //prints size
-  printf("number of words: %lld\n", cptWord);
+
+  if (debug_mode > 0)
+    printf("number of words: %lld\n", cptWord);
@@ -1228,10 +1236,12 @@ void createWordVectorFile() {
 
   //removes #bangs
-  for (i = 1; i < [...]
-  if (debug_mode > 0) {
+  for (i = 1; i < [...]
+  if (debug_mode > 0)
+    printf("Saved %lld word vectors, %d grams weren't in dictionary, %d words were skipped (doubles)\n", cptWord, unexistCpt, skipCpt);
 
   fclose(fo);
   fclose(fin);
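The word2vec.c hunks above only adjust logging around the fork's -ngram/-hashbang path; the splitting itself lies outside this patch. As a rough illustration of character n-gram windowing of the kind those flags enable -- the '#' boundary markers and the whole-word fallback are assumptions inferred from the "//removes #bangs" comment and the lenWord <= ngram branch, not the author's exact scheme:

    def char_ngrams(word, n, hashbang=True):
        # assumed: '#' marks word boundaries when -hashbang 1 is set
        token = "#" + word + "#" if hashbang else word
        if len(token) <= n:              # short words are kept whole,
            return [token]               # as in LearnVocabFromTrainFile above
        return [token[i:i + n] for i in range(len(token) - n + 1)]

    print char_ngrams("cats", 3)         # ['#ca', 'cat', 'ats', 'ts#']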