Main Page | Class Hierarchy | Class List | File List | Class Members | File Members

language.cc

Go to the documentation of this file.
00001 /*
00002  *      HT Editor
00003  *      language.cc
00004  *
00005  *      Copyright (C) 1999-2002 Sebastian Biallas (sb@web-productions.de)
00006  *
00007  *      This program is free software; you can redistribute it and/or modify
00008  *      it under the terms of the GNU General Public License version 2 as
00009  *      published by the Free Software Foundation.
00010  *
00011  *      This program is distributed in the hope that it will be useful,
00012  *      but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  *      GNU General Public License for more details.
00015  *
00016  *      You should have received a copy of the GNU General Public License
00017  *      along with this program; if not, write to the Free Software
00018  *      Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00019  *
00020  */
00021 
00022 #include <string.h>
00023 #include "htdebug.h"
00024 #include "htstring.h"
00025 #include "language.h"
00026 #include "tools.h"
00027 
00028 /*
00029  *      ' ' = separator -> skip
00030  *      INV = invalid char -> error
00031  */
00032 byte mapchar[]={
00033         ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', 10,' ',' ',' ',' ',' ', // 0-15
00034         ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', // 16-31
00035         ' ','!','"','#','$','%','&', 39,'(',')','*','+',',','-','.','/', // 32-47
00036         '0','0','0','0','0','0','0','0','0','0',':',';','<','=','>','?', // 48-63
00037         '@','A','A','A','A','A','A','A','A','A','A','A','A','A','A','A', // 64-79
00038         'A','A','A','A','A','A','A','A','A','A','A','[', 92,']','^','_', // 80-95
00039         INV,'A','A','A','A','A','A','A','A','A','A','A','A','A','A','A', // 96-111
00040         'A','A','A','A','A','A','A','A','A','A','A','{','|','}','~',INV, // 112-127
00041         INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00042         INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00043         INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00044         INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00045         INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00046         INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00047         INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00048         INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV
00049 };
00050 
00051 int analy_string__raw_test(const byte *s, int len)
00052 {
00053         // could be a "" string
00054         if (!len) return 1;
00055 
00056         int all_word_len=0;
00057         int bad_chars=0;
00058         int words=0;
00059         bool word_start=false;
00060         for (int i=0; i<len; i++) {
00061                 byte mc = mapchar[s[i]];
00062                 if (s[i]=='\n' || s[i]=='\t') {
00063                         if (word_start) {
00064                                 words++;
00065                                 word_start = false;
00066                         }
00067                 } else if (s[i]<32 || mc==INV) {
00068                         if (word_start) {
00069                                 words++;
00070                                 word_start = false;
00071                         }
00072                         bad_chars++;
00073                         if (s[i]==0) {
00074                                 bad_chars += 50;
00075                         }
00076                 } else if (s[i]==' ') {
00077                         if (word_start) {
00078                                 words++;
00079                                 word_start = false;
00080                         }
00081                 } else if (mc=='A' || mc=='0') {
00082                         word_start = true;
00083                         all_word_len++;
00084                 } else {
00085                         // symbols (+-/* ...)
00086                         // currently the same as ' '
00087                         if (word_start) {
00088                                 words++;
00089                                 word_start = false;
00090                         }
00091                 }
00092         }
00093         if (word_start) {
00094                 words++;
00095                 word_start = false;
00096         }
00097         // more than 10% badchars --> no string
00098         if (bad_chars*10>len) return -1;
00099         // no words found
00100         if (!words) return len/2-bad_chars*5+1;
00101         int average_word_len = all_word_len / words;
00102         int av_res[10] = {1, 2, 4, 8, 16, 16, 10, 8, 6, 3};
00103         int av_plus=0;
00104         if (average_word_len>1 && average_word_len<13) av_plus = av_res[average_word_len-2];
00105         return words*2+av_plus-bad_chars*5+len/5+average_word_len;
00106 }
00107 
00108 /*
00109  *      CLASS analy_string
00110  */
00111 void analy_string::init(const byte *s, int Len)
00112 {
00113         string = (byte*)smalloc(Len);
00114         memcpy(string, s, Len);
00115         len = Len;
00116 }
00117 
00118 void analy_string::done()
00119 {
00120         free(string);
00121 }
00122 
00123 int  analy_string::length()
00124 {
00125         return len;
00126 }
00127 
00128 void analy_string::set_len(int Len)
00129 {
00130         len = Len;
00131 }
00132 
00133 /*
00134  *      CLASS analy_raw_string
00135  */
00136 void analy_raw_string::render_string(char *result, int maxlen)
00137 {
00138 }
00139 
00140 int analy_raw_string::string_test(const byte *s, int testlen, int &foundlen)
00141 {
00142         foundlen = 0;
00143         return 0;
00144 }
00145 
00146 char *analy_raw_string::name()
00147 {
00148         return "raw";
00149 }
00150 
00151 /*
00152  *      CLASS analy_c_string
00153  */
00154 void analy_c_string::render_string(char *result, int maxlen)
00155 {
00156         assert(maxlen);
00157         maxlen--;
00158         int Len = MIN(len, maxlen);
00159         if (Len) Len--;
00160         memcpy(result, string, Len);
00161         result[Len]=0;          
00162 }
00163 
00164 int analy_c_string::string_test(const byte *s, int testlen, int &foundlen)
00165 {
00166         // search for \0
00167         byte *np = (byte *)memchr(s, 0, testlen);
00168         if (!np) return -1;
00169         int len = np-s+1;
00170         foundlen = len;
00171         return analy_string__raw_test(s, len-1);
00172 }
00173 
00174 char *analy_c_string::name()
00175 {
00176         return "strz";
00177 }
00178 
00179 /*
00180  *      CLASS analy_unicode_string
00181  */
00182 void analy_unicode_string::render_string(char *result, int maxlen)
00183 {
00184         wide_char_to_multi_byte(result, string, maxlen);
00185 }
00186 
00187 int analy_unicode_string::string_test(const byte *s, int testlen, int &foundlen)
00188 {
00189         // this is not good
00190         byte *a = (byte*)smalloc(testlen/2+1);
00191         wide_char_to_multi_byte((char*)a, s, testlen/2);
00192         // search for \0
00193         byte *np = (byte *)memchr(a, 0, testlen/2);
00194         if (!np) {
00195                 free(a);
00196                 return -1;
00197         }
00198         int len = np-a;
00199         foundlen = len*2+2;
00200         int res = analy_string__raw_test(a, len);
00201         free(a);
00202         return res;
00203 }
00204 
00205 char *analy_unicode_string::name()
00206 {
00207         return "strw";
00208 }
00209 
00210 /*
00211  *      CLASS analy_pascal_string
00212  */
00213 void analy_pascal_string::render_string(char *result, int maxlen)
00214 {
00215         assert(maxlen);
00216         maxlen--;
00217         int Len = MIN(*string, maxlen);
00218         if (Len) memcpy(result, string+1, Len);
00219         result[Len]=0;
00220 }
00221 
00222 int analy_pascal_string::string_test(const byte *s, int testlen, int &foundlen)
00223 {
00224         int len = *s;
00225         if (len>testlen) return -1;
00226         foundlen = len+1;
00227         return analy_string__raw_test(s+1, len);
00228 }
00229 
00230 char *analy_pascal_string::name()
00231 {
00232         return "strp";
00233 }
00234 
00235 #define STRING_TESTS 2
00236 analy_string *string_test(const byte *s, int testlen)
00237 {
00238         if (!testlen) return NULL;     
00239         int p[STRING_TESTS+1];
00240         int len[STRING_TESTS];
00241         p[0] = analy_c_string::string_test(s, testlen, len[0]);
00242         p[1] = analy_unicode_string::string_test(s, testlen, len[1]);
00243 //      p[2] = analy_pascal_string::string_test(s, testlen, len[2]);
00244         p[STRING_TESTS] = 5;
00245         int j = STRING_TESTS;
00246         for (int i=0;i<STRING_TESTS;i++) {
00247                 if (p[i]>p[j]) j = i;
00248         }
00249         analy_string *as = NULL;;
00250         switch (j) {
00251                 case 0:
00252                         as = new analy_c_string();
00253                         break;
00254                 case 1:
00255                         as = new analy_unicode_string();
00256                         break;
00257 /*              case 2:
00258                         as = new analy_pascal_string();
00259                         break;*/
00260                 default:
00261                         break;
00262         }
00263         if (as) {
00264                 as->init(s, testlen);
00265                 as->set_len(len[j]);
00266         }
00267         return as;
00268 }
00269 

Generated on Fri May 7 21:15:40 2004 by doxygen 1.3.5