I have this code here and it work perfectly.
# encoding=utf8
#Import the necessary methods from tweepy library
import sys
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
reload(sys)
sys.setdefaultencoding('utf8')
#Variables that contains the user credentials to access Twitter API
access_token = ""
access_token_secret = ""
consumer_key = ""
consumer_secret = ""
#This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
def on_data(self, data):
#save data
with open('debate_data.txt', 'a') as tf:
tf.write((data).decode('unicode-escape').encode('utf-8'))
return True
def on_error(self, status):
print status
if __name__ == '__main__':
#This handles Twitter authetification and the connection to Twitter Streaming API
l = StdOutListener()
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, l)
#This line filter Twitter Streams to capture data by the keywords: 'Bernier', 'Rossello', 'Bernabe'
stream.filter(track=['Bernier', 'Rosselló', 'Rossello', 'Bernabe', 'Lúgaro', 'Lugaro', 'María de Lourdes', 'Maria de Lourdes', 'Cidre'])
But when I run this other piece of code I get the wrong answer.
import json
import io
#save the tweets to this path
tweets_data_path = 'debate_data.txt'
tweets_data = []
with io.open(tweets_data_path, 'r') as tweets_file:
for line in tweets_file:
try:
tweet = json.loads(line)
tweets_data.append(tweet)
except:
continue
print len(tweets_data)
There are 42,188 Tweets on that file, but when I run the code Im only getting 291. I think is something with the encoding/decoding but I cant figure out what. Any help would be greatly appreciate.
I ran this example without any of the encoding/decoding and it worked perfectly.
2
Answers
The reason of only getting 291 is the
json.loads()
throw some errors andexcept
continue it.I suggest you print the error just like:
now you know the error reason, and solve it.
Are you sure the format of data inside
debate_data.txt
arejson
?As agnewee said, I also recommend: