00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <string.h>
00023 #include "htdebug.h"
00024 #include "htstring.h"
00025 #include "language.h"
00026 #include "tools.h"
00027
00028
00029
00030
00031
00032 byte mapchar[]={
00033 ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ', 10,' ',' ',' ',' ',' ',
00034 ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
00035 ' ','!','"','#','$','%','&', 39,'(',')','*','+',',','-','.','/',
00036 '0','0','0','0','0','0','0','0','0','0',':',';','<','=','>','?',
00037 '@','A','A','A','A','A','A','A','A','A','A','A','A','A','A','A',
00038 'A','A','A','A','A','A','A','A','A','A','A','[', 92,']','^','_',
00039 INV,'A','A','A','A','A','A','A','A','A','A','A','A','A','A','A',
00040 'A','A','A','A','A','A','A','A','A','A','A','{','|','}','~',INV,
00041 INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00042 INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00043 INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00044 INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00045 INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00046 INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00047 INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,
00048 INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV,INV
00049 };
00050
00051 int analy_string__raw_test(const byte *s, int len)
00052 {
00053
00054 if (!len) return 1;
00055
00056 int all_word_len=0;
00057 int bad_chars=0;
00058 int words=0;
00059 bool word_start=false;
00060 for (int i=0; i<len; i++) {
00061 byte mc = mapchar[s[i]];
00062 if (s[i]=='\n' || s[i]=='\t') {
00063 if (word_start) {
00064 words++;
00065 word_start = false;
00066 }
00067 } else if (s[i]<32 || mc==INV) {
00068 if (word_start) {
00069 words++;
00070 word_start = false;
00071 }
00072 bad_chars++;
00073 if (s[i]==0) {
00074 bad_chars += 50;
00075 }
00076 } else if (s[i]==' ') {
00077 if (word_start) {
00078 words++;
00079 word_start = false;
00080 }
00081 } else if (mc=='A' || mc=='0') {
00082 word_start = true;
00083 all_word_len++;
00084 } else {
00085
00086
00087 if (word_start) {
00088 words++;
00089 word_start = false;
00090 }
00091 }
00092 }
00093 if (word_start) {
00094 words++;
00095 word_start = false;
00096 }
00097
00098 if (bad_chars*10>len) return -1;
00099
00100 if (!words) return len/2-bad_chars*5+1;
00101 int average_word_len = all_word_len / words;
00102 int av_res[10] = {1, 2, 4, 8, 16, 16, 10, 8, 6, 3};
00103 int av_plus=0;
00104 if (average_word_len>1 && average_word_len<13) av_plus = av_res[average_word_len-2];
00105 return words*2+av_plus-bad_chars*5+len/5+average_word_len;
00106 }
00107
00108
00109
00110
00111 void analy_string::init(const byte *s, int Len)
00112 {
00113 string = (byte*)smalloc(Len);
00114 memcpy(string, s, Len);
00115 len = Len;
00116 }
00117
00118 void analy_string::done()
00119 {
00120 free(string);
00121 }
00122
00123 int analy_string::length()
00124 {
00125 return len;
00126 }
00127
00128 void analy_string::set_len(int Len)
00129 {
00130 len = Len;
00131 }
00132
00133
00134
00135
00136 void analy_raw_string::render_string(char *result, int maxlen)
00137 {
00138 }
00139
00140 int analy_raw_string::string_test(const byte *s, int testlen, int &foundlen)
00141 {
00142 foundlen = 0;
00143 return 0;
00144 }
00145
00146 char *analy_raw_string::name()
00147 {
00148 return "raw";
00149 }
00150
00151
00152
00153
00154 void analy_c_string::render_string(char *result, int maxlen)
00155 {
00156 assert(maxlen);
00157 maxlen--;
00158 int Len = MIN(len, maxlen);
00159 if (Len) Len--;
00160 memcpy(result, string, Len);
00161 result[Len]=0;
00162 }
00163
00164 int analy_c_string::string_test(const byte *s, int testlen, int &foundlen)
00165 {
00166
00167 byte *np = (byte *)memchr(s, 0, testlen);
00168 if (!np) return -1;
00169 int len = np-s+1;
00170 foundlen = len;
00171 return analy_string__raw_test(s, len-1);
00172 }
00173
00174 char *analy_c_string::name()
00175 {
00176 return "strz";
00177 }
00178
00179
00180
00181
00182 void analy_unicode_string::render_string(char *result, int maxlen)
00183 {
00184 wide_char_to_multi_byte(result, string, maxlen);
00185 }
00186
00187 int analy_unicode_string::string_test(const byte *s, int testlen, int &foundlen)
00188 {
00189
00190 byte *a = (byte*)smalloc(testlen/2+1);
00191 wide_char_to_multi_byte((char*)a, s, testlen/2);
00192
00193 byte *np = (byte *)memchr(a, 0, testlen/2);
00194 if (!np) {
00195 free(a);
00196 return -1;
00197 }
00198 int len = np-a;
00199 foundlen = len*2+2;
00200 int res = analy_string__raw_test(a, len);
00201 free(a);
00202 return res;
00203 }
00204
00205 char *analy_unicode_string::name()
00206 {
00207 return "strw";
00208 }
00209
00210
00211
00212
00213 void analy_pascal_string::render_string(char *result, int maxlen)
00214 {
00215 assert(maxlen);
00216 maxlen--;
00217 int Len = MIN(*string, maxlen);
00218 if (Len) memcpy(result, string+1, Len);
00219 result[Len]=0;
00220 }
00221
00222 int analy_pascal_string::string_test(const byte *s, int testlen, int &foundlen)
00223 {
00224 int len = *s;
00225 if (len>testlen) return -1;
00226 foundlen = len+1;
00227 return analy_string__raw_test(s+1, len);
00228 }
00229
00230 char *analy_pascal_string::name()
00231 {
00232 return "strp";
00233 }
00234
00235 #define STRING_TESTS 2
00236 analy_string *string_test(const byte *s, int testlen)
00237 {
00238 if (!testlen) return NULL;
00239 int p[STRING_TESTS+1];
00240 int len[STRING_TESTS];
00241 p[0] = analy_c_string::string_test(s, testlen, len[0]);
00242 p[1] = analy_unicode_string::string_test(s, testlen, len[1]);
00243
00244 p[STRING_TESTS] = 5;
00245 int j = STRING_TESTS;
00246 for (int i=0;i<STRING_TESTS;i++) {
00247 if (p[i]>p[j]) j = i;
00248 }
00249 analy_string *as = NULL;;
00250 switch (j) {
00251 case 0:
00252 as = new analy_c_string();
00253 break;
00254 case 1:
00255 as = new analy_unicode_string();
00256 break;
00257
00258
00259
00260 default:
00261 break;
00262 }
00263 if (as) {
00264 as->init(s, testlen);
00265 as->set_len(len[j]);
00266 }
00267 return as;
00268 }
00269