skip to Main Content

I have a working REST Search API script that pulls tweets according to https://www.karambelkar.info/2015/01/how-to-use-twitters-search-rest-api-most-effectively./

Problem: This code works, but it pulls tweets that match both searchQuery1 and searchQuery2 (e.g. tweets with Prostate Cancer + Colon Cancer). I don’t want this. Instead, I would like to get all of the tweets from searchQuery1 (only tweets with Prostate Cancer), and then all of the tweets from searchQuery2 (only tweets with Colon Cancer). The queries should run separately.

Goal: Sequentially loop over X number of search queries (e.g. searchQuery1, searchQuery2, etc)

Thank you!

searchQuery1 = 'Prostate Cancer'  
searchQuery2 = 'Colon Cancer' 


maxTweets = 10000
tweetsPerQry = 100  
fprefix = 'REST' 
sinceId = None
max_id = -1L


tweetCount = 0
with open('/Users/eer/Desktop/' + fprefix + '.' + time.strftime('%Y-%m-%d_%H-%M-%S') + '.json', 'a+') as f: #open file
    while tweetCount < maxTweets: 
        try:

            if (max_id <= 0):
                if (not sinceId):
                    for x,y in zip(searchQuery1,searchQuery2):
                        new_tweets = api.search(q=[searchQuery1, searchQuery2], count=tweetsPerQry)
                else:
                    print "sinceID 1"
                    new_tweets = api.search(q=[searchQuery1, searchQuery2], count=tweetsPerQry,
                                            since_id=sinceId)

            else:
                if (not sinceId):
                    print "not sinceID 2"
                    new_tweets = api.search(q=[searchQuery1, searchQuery2], count=tweetsPerQry,
                                            max_id=str(max_id - 1))
                else:
                    print "sinceID 1"
                    new_tweets = api.search(q=[searchQuery1, searchQuery2], count=tweetsPerQry,
                                            max_id=str(max_id - 1),
                                            since_id=sinceId)
            if not new_tweets:
                print("No more tweets found")
                break                 

            for tweet in new_tweets: 
                f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
                        'n')


            tweetCount += len(new_tweets) 
            max_id = new_tweets[-1].id

        except tweepy.TweepError as e:
            print("some error : " + str(e))
            break

print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fprefix))

2

Answers


  1. Chosen as BEST ANSWER
    searchQuery = ['Prostate Cancer', 'Colon Cancer']
    i = 0
    
    
    maxTweets = 1000
    tweetsPerQry = 100  
    fprefix = 'REST' 
    language = ['en']
    
    sinceId = None
    max_id = -1L
    
    tweetCount = 0
    print("Downloading max {0} tweets".format(maxTweets))
    with open('/Users/eer/Desktop/' + fprefix + '.' + time.strftime('%Y-%m-%d_%H-%M-%S') + '.json', 'a+') as f: 
        while tweetCount < maxTweets: 
            try:
                if (max_id <= 0):
                    if (not sinceId):
    
                        for search in searchQuery:
                            new_tweets = api.search(q=searchQuery[i], count=tweetsPerQry, languages=language)
    
                    else:
                        for search in searchQuery:
                            new_tweets = api.search(q=searchQuery[i], count=tweetsPerQry,
                                                since_id=sinceId, languages=language)
    
                else:
                        print "not sinceID 2"
                        for search in searchQuery:
                            new_tweets = api.search(q=searchQuery[i], count=tweetsPerQry,
                                                max_id=str(max_id - 1),languages=language)
                    else:
    
                        for search in searchQuery:
                            new_tweets = api.search(q=searchQuery[i], count=tweetsPerQry,
                                                max_id=str(max_id - 1),
                                                since_id=sinceId, languages=language)
                if not new_tweets:
                    print("No more tweets found; checking next query")
                    i = i + 1
    
                    try:
                        for search in searchQuery:
                            new_tweets = api.search(q=searchQuery[i], count=tweetsPerQry, languages=language)
                    except IndexError:
                        break
    
                for tweet in new_tweets:         
                    f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
                            'n')
    
                tweetCount += len(new_tweets) 
                print("Downloaded {0} tweets".format(tweetCount))
                max_id = new_tweets[-1].id
    
            except tweepy.TweepError as e:
                print("some error : " + str(e))
                break
    
    print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fprefix))
    

  2. I would change your query to '"Prostate Cancer" OR "Colon Cancer"' and store the results. Then order them how you want later. It sounds like you want the following in pseudo-code:

    tweets_with_Prostate_Cancer = []
    tweets_with_Colon_Cancer = []
    
    for each tweet in the result set:
        if tweet contains "Prostate Cancer" and does not contain "Colon Cancer":
            tweets_with_Prostate_Cancer.Add(tweet)
        if tweet contains "Colon Cancer" and does not contain "Prostate Cancer":
        tweets_with_Colon_Cancer.Add(tweet)
    
    final_results = Concatenate(tweets_with_Prostate_Cancer, tweets_with_Colon_Cancer)
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search