|
Posted on 2008-04-20 11:00 花之剑 阅读(567) 评论(0) 编辑 收藏
很简单地解析 HTML 文件内容:过滤 script 和 css,并隔开中英文,以便切词使用。速度没有测试过,应该不是很快,有待优化。
#include
<
stdio.h
>
#include
<
stdlib.h
>
#include
<
time.h
>
#include
<
string
.h
>
#ifdef _WIN32
# include
<
conio.h
>
# include
<
tchar.h
>
# include
<
winsock2.h
>
# include
<
errno.h
>
# include
<
assert.h
>
#elif
defined(_LINUX) || defined(__LINUX)
//
linux
# include
<
iconv.h
>
# include
<
errno.h
>
# include
<
signal.h
>
# include
<
execinfo.h
>
# include
<
termios.h
>
# include
<
unistd.h
>
# include
<
assert.h
>
# include
<
string
.h
>
# include
<
sys
/
ioctl.h
>
# include
<
sys
/
time.h
>
# include
<
sys
/
types.h
>
# include
<
dlfcn.h
>
# include
<
sys
/
select.h
>
#endif
/* Path of the HTML file that main() opens for reading. */
const char *g_html = "source.html";

/* Path of the text file that main() opens in append mode for the result. */
const char *g_dest = "dest.txt";
/*
 * Returns the position just past the '>' that ends the first occurrence of
 * closeTagPrefix (e.g. "</style") at or after pos.  If the closing tag, or
 * its '>', is missing, returns a pointer to the terminating NUL so that the
 * caller stops instead of running off the end of the buffer.
 */
static const char *SkipPastClosingTag(const char *pos, const char *closeTagPrefix)
{
    const char *closeTag = strstr(pos, closeTagPrefix);
    const char *tagEnd;

    if (closeTag == NULL) {
        return pos + strlen(pos);
    }
    tagEnd = strchr(closeTag, '>');
    if (tagEnd == NULL) {
        return closeTag + strlen(closeTag);
    }
    return tagEnd + 1;
}

/*
 * Extracts the text content of an HTML document.
 *
 * inbuffer  - NUL-terminated HTML source (not modified).
 * outbuffer - receives the NUL-terminated text; it must have room for
 *             strlen(inbuffer) + 1 bytes, because at most one byte is
 *             written per input byte.
 *
 * Behaviour:
 *  - Everything from "<style" / "<script" up to and including the matching
 *    closing tag is dropped.  An unterminated element swallows the rest of
 *    the input.
 *  - Every byte between '<' and '>' is dropped; bytes following a '>' are
 *    copied until the next '<'.
 *  - Bytes that precede the first '>' are not copied (the scanner starts in
 *    the "inside a tag" state).
 *  - Tag matching is case-sensitive and prefix-based; character entities
 *    are copied verbatim.
 */
void HtmlToText(char *inbuffer, char *outbuffer)
{
    static const char styleOpen[] = "<style";
    static const char scriptOpen[] = "<script";
    const char *in = inbuffer;
    int bIsText = 0;

    if (outbuffer == NULL) {
        return;
    }
    /* Guarantee a valid (empty) string even when nothing is copied. */
    *outbuffer = '\0';
    if (in == NULL) {
        return;
    }

    while (*in != '\0') {
        /* Skipping an element deliberately leaves bIsText untouched. */
        if (strncmp(in, styleOpen, sizeof styleOpen - 1) == 0) {
            in = SkipPastClosingTag(in, "</style");
            continue;
        }
        if (strncmp(in, scriptOpen, sizeof scriptOpen - 1) == 0) {
            in = SkipPastClosingTag(in, "</script");
            continue;
        }

        if (*in == '<') {
            bIsText = 0;
        } else if (*in == '>') {
            bIsText = 1;
            in++;
            continue;
        }

        if (bIsText) {
            *outbuffer++ = *in;
        }
        in++;
    }

    *outbuffer = '\0';
}
/*
 * Tidies a string for word segmentation (first-pass handling of punctuation
 * and of mixed single-byte / multi-byte text).
 *
 * str           - NUL-terminated input (not modified).
 * reviseContent - receives the NUL-terminated result.  Any previous content
 *                 is overwritten.  It must have room for
 *                 2 * strlen(str) + 1 bytes, because a separator may be
 *                 inserted in front of a copied byte.
 *
 * Bytes are classified one at a time:
 *  - 1..64 (control characters, space, digits, most punctuation, '@') are
 *    never copied; a run of them following copied text collapses into a
 *    single separator (a space).
 *  - 65..127 are copied; a separator is inserted first when the previously
 *    copied byte was a high byte.
 *  - 128..255 (bytes of multi-byte characters) are copied; a separator is
 *    inserted first when the previously copied byte was in 65..127.
 */
void ReviseString(char *str, char *reviseContent)
{
    /* Class of the most recently handled byte. */
    enum { PREV_BLANK = 0, PREV_ASCII = 1, PREV_HIGH = 2 };
    const char splitChar = ' ';
    const char *src = str;
    char *dst = reviseContent;
    int prechar = PREV_BLANK;

    if (dst == NULL) {
        return;
    }
    /* Start from an empty result instead of appending to whatever is there. */
    *dst = '\0';
    if (src == NULL) {
        return;
    }

    for (; *src != '\0'; src++) {
        /* Classify as unsigned so the result does not depend on whether
         * plain char is signed on this platform. */
        const unsigned char c = (unsigned char)*src;

        if (c <= 64) {
            /* Separator-class byte: emit one split char per run, and only
             * after real text. */
            if (prechar != PREV_BLANK) {
                *dst++ = splitChar;
                prechar = PREV_BLANK;
            }
            continue;
        }

        if (c < 0x80) {
            if (prechar == PREV_HIGH) {
                *dst++ = splitChar;
            }
            prechar = PREV_ASCII;
        } else {
            if (prechar == PREV_ASCII) {
                *dst++ = splitChar;
            }
            prechar = PREV_HIGH;
        }
        *dst++ = *src;
    }

    *dst = '\0';
}
/*
 * Reads the file named by g_html, strips the markup with HtmlToText(),
 * normalises the result with ReviseString() and appends it to the file
 * named by g_dest.  On success it waits for one character on stdin before
 * returning 0; on any failure it returns / exits with -1.
 */
int main(void)
{
    FILE *fp = NULL;
    FILE *fw = NULL;
    char *buff = NULL;       /* raw file content */
    char *outbuffer = NULL;  /* text extracted by HtmlToText() */
    char *textBuffer = NULL; /* output of ReviseString() */
    long len = 0;
    size_t nread = 0;
    int rc = -1;

    if ((fp = fopen(g_html, "r")) == NULL) {
        puts("open source file errod");
        exit(-1);
    }
    if ((fw = fopen(g_dest, "a")) == NULL) {
        puts("open desr file errod");
        fclose(fp);
        exit(-1);
    }

    /* Determine the file size so that the buffers can be sized up front. */
    if (fseek(fp, 0, SEEK_END) != 0 || (len = ftell(fp)) < 0 ||
        fseek(fp, 0, SEEK_SET) != 0) {
        fputs("cannot determine source file size\n", stderr);
        goto cleanup;
    }

    /*
     * Heap buffers instead of file-sized stack arrays.  calloc zero-fills, so
     * every buffer is a valid empty string even if a callee writes nothing.
     * ReviseString() may insert one separator per copied byte, hence the
     * double-size text buffer.
     */
    buff = calloc((size_t)len + 1, 1);
    outbuffer = calloc((size_t)len + 1, 1);
    textBuffer = calloc((size_t)len + 1, 2);
    if (buff == NULL || outbuffer == NULL || textBuffer == NULL) {
        fputs("out of memory\n", stderr);
        goto cleanup;
    }

    /* In text mode fewer than len bytes may be delivered; terminate at the
     * number of bytes actually read. */
    nread = fread(buff, 1, (size_t)len, fp);
    if (ferror(fp)) {
        fputs("read source file error\n", stderr);
        goto cleanup;
    }
    buff[nread] = '\0';

    HtmlToText(buff, outbuffer);
    ReviseString(outbuffer, textBuffer);

    if (fprintf(fw, "%s", textBuffer) < 0) {
        fputs("write dest file error\n", stderr);
        goto cleanup;
    }
    rc = 0;

cleanup:
    /* The memory came from calloc, so it is released with free(). */
    free(textBuffer);
    free(outbuffer);
    free(buff);
    fclose(fp);
    /* fclose flushes the output; a failure here means data was lost. */
    if (fclose(fw) != 0) {
        fputs("close dest file error\n", stderr);
        rc = -1;
    }

    if (rc == 0) {
        getchar(); /* keep the console open until a key is pressed */
    }
    return rc;
}
|