hax01 tips
Note: this post will very likely be set to private later.

The challenge
hax01
Your mission is the following: Simply enter a URL into the box. The domain of the URL must be or end with 'nasa.gov'. The URL will be fetched right away. The content returned should contain the string: "2200178118" in the first 10 Kbytes of data. 404/403/etc error pages are not accepted. Remember, do not do anything illegal. Make sure you type the right URL, do not guess.
Hint: google is your friend.
http://google.com/search?q=site:nasa.gov
My idea at the time was to find every domain ending in nasa.gov and then walk through those sites. I actually wrote a Python program for it: it took the first 1000 results Google returned, extracted the domains, and saved them; after deduplication there were just over 500. It then fetched each site's HTML and checked whether it contained the string. Along the way I hit a network-speed problem: after a timeout, the request would often get hijacked to China Telecom's 114 search page. After verifying 170-odd pages, I realized I had misread the problem: it is not the URL that must end in nasa.gov, but the URL's domain. I was speechless; that means every page under those domains is a candidate, and the workload is so huge it borders on impossible. I shelved it for the time being.

The code below is for reference; it went through countless revisions and may no longer run as-is.
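As an aside, the timeout-then-hijack problem can be blunted by setting a global socket timeout so a stalled fetch fails fast instead of wandering off to the ISP page. A minimal sketch, assuming Python 3 and an arbitrary 10-second limit (not part of the reference code below):

```python
import socket
import urllib.request
import urllib.error

socket.setdefaulttimeout(10)        # give up on a stalled connection after 10 s

def fetch(url):
    """Return the page body, or None on timeout or error."""
    try:
        return urllib.request.urlopen(url).read()
    except (urllib.error.URLError, socket.timeout) as e:
        print(url, '->', e)
        return None
```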
Python 2.5:

```python
from urllib import FancyURLopener
import urllib2
import re

"""
# Phase 1 (disabled): scrape Google result pages for *.nasa.gov domains.
class MyOpener(FancyURLopener):
    # Spoof a Firefox user agent so Google serves normal result pages.
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

res = re.compile(r'(([a-zA-Z]+\.)+nasa\.gov)')

myopener = MyOpener()
base = 'http://www.google.co.jp/search?&num=100&as_qdr=all&as_occt=any&as_sitesearch=nasa.gov'
li = []
for i in range(0, 10):
    page = myopener.open(base + '&start=' + str(i * 100))
    str1 = page.read()
    for aItem in res.findall(str1):
        if not aItem[0] in li:
            li.append(aItem[0])
"""

# Phase 2: fetch each collected URL and look for the target string.
with open('nasa.txt') as li:        # one 'http://...' URL per line
    m = 0
    for a in li:
        m = m + 1
        print m                     # progress counter
        url = a.strip()
        req = urllib2.Request(url)
        try:
            response = urllib2.urlopen(req)
            the_page = response.read()
            # Save the page for later inspection ('http://' stripped).
            with open(url[7:] + '.txt', 'w') as nasa:
                nasa.write(the_page)
            if the_page.find(r'daohang.118114.cn') != -1:
                print '114'         # timed out, hijacked to the ISP's 114 page
            elif the_page.find('2200178118', 0, 10240) != -1:
                print url           # hit: string found in the first 10 KB
            else:
                print ''
        except urllib2.URLError, e:
            print e

"""
#gUrl = 'http://www.google.co.jp/search?hl=ja&source=hp&q=site%3Anasa.gov&lr=&aq=f&oq='
#google = urllib.urlopen(gUrl)
#str = google.read()
# Extract *.nasa.gov domains from a saved Google result page.
for str in open('sitenasa_gov.htm'):
    for aItem in res.findall(str):
        print aItem[0]

#print str
str = 'www.xxx.nasa.gov/wwf.nasa.gov'
"""

#2200178118
```
Python 3.1:

```python
import urllib.request
import urllib.error
import sys
import re

"""
# Phase 1 (disabled): scrape Google result pages for *.nasa.gov domains.
class MyOpener(urllib.request.FancyURLopener):
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

res = re.compile(r'(([a-zA-Z]+\.)+nasa\.gov)')

myopener = MyOpener()
base = 'http://www.google.co.jp/search?&num=100&as_qdr=all&as_occt=any&as_sitesearch=nasa.gov'
li = []
for i in range(0, 10):
    page = myopener.open(base + '&start=' + str(i * 100))
    str1 = page.read()
    for aItem in res.findall(str1):
        if not aItem[0] in li:
            li.append(aItem[0])
"""

fiPath = sys.argv[1]                # URL list file, one 'http://...' URL per line
with open(fiPath) as li:
    m = 0
    for a in li:
        m = m + 1
        #print(m)
        url = a.strip()
        req = urllib.request.Request(url)
        try:
            response = urllib.request.urlopen(req)
            the_page = response.read()
            # Save the raw page for later inspection ('http://' stripped).
            with open(url[7:] + '.txt', 'wb') as nasa:
                nasa.write(the_page)
            if the_page.decode('utf8').find(r'icc.qonc.com') != -1:
                print('114')        # hijacked to the ISP's navigation page
            elif the_page.decode('utf8').find('2200178118', 0, 10240) != -1:
                print(url)          # hit: string found in the first 10 KB
            else:
                print('')
        except urllib.error.URLError as e:
            print(e)
        except UnicodeDecodeError as UDE:
            print(UDE)

"""
#gUrl = 'http://www.google.co.jp/search?hl=ja&source=hp&q=site%3Anasa.gov&lr=&aq=f&oq='
#google = urllib.urlopen(gUrl)
#str = google.read()
for str in open('sitenasa_gov.htm'):
    for aItem in res.findall(str):
        print(aItem[0])

#print str
str = 'www.xxx.nasa.gov/wwf.nasa.gov'
"""

#2200178118
```
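Both versions expect the URL list produced in phase 1; the 3.1 version takes the file path on the command line, e.g. `python hax01.py nasa.txt` (the script name here is just whatever you saved it as).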
Then, a few months later, I came at it from a different angle and solved it. It turned out to be unexpectedly simple.

The point is that when you submit data to a server, the server will generally echo that data back on the page. The final hint is not telling us to search the domain; it is showing us how to get the data we want displayed on a google.com page. For example, the first 10 KB of http://www.google.co.jp/search?q=2200178118 should contain that string. All that remains is to find a page on nasa.gov that accepts submitted data.
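To check a candidate echo page automatically, the fetch-and-scan logic shrinks to a few lines. A minimal sketch, assuming Python 3; the nasa.gov URL is only a hypothetical stand-in for whatever page you find that reflects a submitted parameter:

```python
import urllib.request

TARGET = '2200178118'
# Hypothetical echo page: substitute any nasa.gov URL that reflects
# a submitted query parameter back into its HTML.
url = 'http://www.nasa.gov/search/?q=' + TARGET

with urllib.request.urlopen(url) as response:
    head = response.read(10240)              # only the first 10 KB counts
    if TARGET in head.decode('utf8', errors='replace'):
        print('echoed back:', url)
    else:
        print('string not echoed in the first 10 KB')
```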
over