|
Posted on 2008-04-20 11:00 花之剑 阅读(566) 评论(0) 编辑 收藏
一个很简单的 HTML 文件内容解析程序：过滤 script 和 css，并把中英文隔开，以便切词使用。速度没有测试过，应该不是很快，有待优化。
#include
<
stdio.h
>
#include
<
stdlib.h
>
#include
<
time.h
>
#include
<
string
.h
>
#ifdef _WIN32 # include
<
conio.h
>
# include
<
tchar.h
>
# include
<
winsock2.h
>
# include
<
errno.h
>
# include
<
assert.h
>
#elif
defined(_LINUX) || defined(__LINUX)
//
linux
# include
<
iconv.h
>
# include
<
errno.h
>
# include
<
signal.h
>
# include
<
execinfo.h
>
# include
<
termios.h
>
# include
<
unistd.h
>
# include
<
assert.h
>
# include
<
string
.h
>
# include
<
sys
/
ioctl.h
>
# include
<
sys
/
time.h
>
# include
<
sys
/
types.h
>
# include
<
dlfcn.h
>
# include
<
sys
/
select.h
>
#endif
/* Path of the HTML document that main() opens for reading. */
const char *g_html = "source.html";

/* Path of the text file that main() opens in append mode for the result. */
const char *g_dest = "dest.txt";
/*
 * Return a pointer just past the element whose opening tag starts at `p`.
 *
 * `closePrefix` is the closing tag without its final '>' (e.g. "</style").
 * Everything up to and including the '>' that ends the closing tag is
 * skipped.  If the closing tag (or its '>') is missing, the result points
 * at the terminating NUL, so the caller never walks past the end of the
 * input.
 */
static const char *SkipPastClosingTag(const char *p, const char *closePrefix)
{
    const char *close = strstr(p, closePrefix);

    if (close == NULL) {
        /* Unterminated element: swallow the rest of the input. */
        return p + strlen(p);
    }

    close += strlen(closePrefix);
    while (*close != '\0' && *close != '>') {
        close++;
    }
    return (*close == '>') ? close + 1 : close;
}

/*
 * Strip markup from an HTML string, keeping only the text between tags.
 *
 * inbuffer  - NUL-terminated HTML input; it is only read.
 * outbuffer - receives the extracted text as a NUL-terminated string.
 *             It must have room for strlen(inbuffer) + 1 bytes, because
 *             at most one byte is written per input byte.
 *
 * Whole <style>...</style> and <script>...</script> elements are dropped,
 * including their contents.  Tag names are matched case-sensitively.
 * Bytes are copied only after a '>' has been seen, so any text that
 * precedes the first tag is discarded.  Character entities are copied
 * through unchanged.
 */
void HtmlToText(char *inbuffer, char *outbuffer)
{
    const char *in = inbuffer;
    char *out = outbuffer;
    int bIsText = 0;

    /* Terminate up front so input without any text yields "". */
    *out = '\0';

    while (*in != '\0') {
        if (strncmp(in, "<style", strlen("<style")) == 0) {
            in = SkipPastClosingTag(in, "</style");
            continue;
        }
        if (strncmp(in, "<script", strlen("<script")) == 0) {
            in = SkipPastClosingTag(in, "</script");
            continue;
        }

        if (*in == '<') {
            /* Entering a tag: stop copying until its closing '>'. */
            bIsText = 0;
        } else if (*in == '>') {
            /* Leaving a tag: what follows is text. The '>' is not copied. */
            bIsText = 1;
            in++;
            continue;
        }

        if (bIsText) {
            *out++ = *in;
            *out = '\0';
        }
        in++;
    }
}
/*
 * Tidy a string: preliminary handling of punctuation and of mixed
 * Chinese/English text, so that a word segmenter can work on the result.
 *
 * str           - NUL-terminated input; it is only read.
 * reviseContent - must already hold a NUL-terminated string (possibly
 *                 empty); the tidied text is appended to it.  Each input
 *                 byte appends at most two bytes (a separator plus the
 *                 byte), so the buffer needs room for its current
 *                 contents plus 2 * strlen(str) + 1 bytes.
 *
 * Bytes are classified as:
 *   1..64    whitespace, digits and ASCII punctuation - never copied; a
 *            run of them collapses into a single space, and only if
 *            something was emitted before it
 *   65..127  treated as English - copied, preceded by a space when the
 *            previous byte was a high byte
 *   >= 128   treated as part of a Chinese (multi-byte) character - copied,
 *            preceded by a space when the previous byte was English
 */
void ReviseString(char *str, char *reviseContent)
{
    enum { PREV_BLANK = 0, PREV_ENGLISH = 1, PREV_CHINESE = 2 };

    const char splitChar = ' ';
    /* Track the write position once instead of re-scanning via strncat. */
    char *out = reviseContent + strlen(reviseContent);
    int prechar = PREV_BLANK;
    const unsigned char *p;

    /* Walk every byte from the first one; unsigned so that high bytes are
     * classified the same way whether plain char is signed or not. */
    for (p = (const unsigned char *)str; *p != '\0'; p++) {
        const unsigned char c = *p;

        if (c <= 64) {
            if (prechar != PREV_BLANK) {
                *out++ = splitChar;
                prechar = PREV_BLANK;
            }
        } else if (c < 0x80) {
            if (prechar == PREV_CHINESE) {
                *out++ = splitChar;
            }
            *out++ = (char)c;
            prechar = PREV_ENGLISH;
        } else {
            if (prechar == PREV_ENGLISH) {
                *out++ = splitChar;
            }
            *out++ = (char)c;
            prechar = PREV_CHINESE;
        }
    }
    *out = '\0';
}
/*
 * Read the file named by g_html, strip its markup with HtmlToText(), tidy
 * the result with ReviseString() and append it to the file named by g_dest.
 *
 * Returns 0 on success and -1 on any failure (a message is printed to
 * stdout).  On success the program waits for one character on stdin
 * before exiting.
 */
int main(void)
{
    FILE *fp = NULL;
    FILE *fw = NULL;
    char *buff = NULL;       /* raw file contents */
    char *outbuffer = NULL;  /* text with markup removed */
    char *textBuffer = NULL; /* tidied text */
    long len = 0;
    size_t size = 0;
    size_t got = 0;
    int rc = -1;

    if ((fp = fopen(g_html, "r")) == NULL) {
        puts("open source file errod");
        goto done;
    }
    if ((fw = fopen(g_dest, "a")) == NULL) {
        puts("open desr file errod");
        goto done;
    }

    /* Determine the file size to dimension the working buffers. */
    if (fseek(fp, 0, SEEK_END) != 0 || (len = ftell(fp)) < 0 ||
        fseek(fp, 0, SEEK_SET) != 0) {
        puts("read source file error");
        goto done;
    }
    size = (size_t)len;

    /*
     * Heap buffers, zero-filled: a file-sized array on the stack can
     * overflow it, and both helpers need valid (empty) strings to start
     * from.  The tidied text can be up to twice as long as its input
     * because a separator may be inserted before every byte.
     */
    buff = calloc(size + 1, 1);
    outbuffer = calloc(size + 1, 1);
    textBuffer = calloc(2 * size + 1, 1);
    if (buff == NULL || outbuffer == NULL || textBuffer == NULL) {
        puts("out of memory");
        goto done;
    }

    /* In text mode fewer bytes than ftell() reported may arrive. */
    got = fread(buff, 1, size, fp);
    if (ferror(fp)) {
        puts("read source file error");
        goto done;
    }
    buff[got] = '\0';

    HtmlToText(buff, outbuffer);
    ReviseString(outbuffer, textBuffer);

    if (fprintf(fw, "%s", textBuffer) < 0) {
        puts("write dest file error");
        goto done;
    }
    rc = 0;

done:
    free(textBuffer);
    free(outbuffer);
    free(buff);
    if (fw != NULL && fclose(fw) != 0) {
        /* fclose flushes; a failure here means the text was not written. */
        puts("write dest file error");
        rc = -1;
    }
    if (fp != NULL) {
        fclose(fp);
    }
    if (rc == 0) {
        getchar();
    }
    return rc;
}
|