From 805706fd85d72845e0d2893b05b06b6d5f7102ee Mon Sep 17 00:00:00 2001
From: virophagesp <96668463+virophagesp@users.noreply.github.com>
Date: Tue, 14 May 2024 13:27:49 +1200
Subject: [PATCH 01/10] implement multiprocessing in auth.py

converted the YOUTUBE variable to a list of 2 builds rather than a single one
---
 Scripts/auth.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Scripts/auth.py b/Scripts/auth.py
index 0fd887a5..ce89e90f 100644
--- a/Scripts/auth.py
+++ b/Scripts/auth.py
@@ -30,7 +30,7 @@ SALT_BYTES = 64
 SCRYPT_N = 2**18
 
-YOUTUBE = None
+YOUTUBE = [None, None]
 
 CURRENTUSER = None
 
@@ -100,7 +100,8 @@ def get_authenticated_service():
   else:
     with open(TOKEN_FILE_NAME, 'w') as token:
       token.write(creds.to_json())
-  YOUTUBE = build(API_SERVICE_NAME, API_VERSION, credentials=creds, discoveryServiceUrl=DISCOVERY_SERVICE_URL, cache_discovery=False, cache=None)
+  YOUTUBE[0] = build(API_SERVICE_NAME, API_VERSION, credentials=creds, discoveryServiceUrl=DISCOVERY_SERVICE_URL, cache_discovery=False, cache=None)
+  YOUTUBE[1] = build(API_SERVICE_NAME, API_VERSION, credentials=creds, discoveryServiceUrl=DISCOVERY_SERVICE_URL, cache_discovery=False, cache=None)
   return YOUTUBE
 
@@ -143,7 +144,7 @@ def get_current_user(config):
 
   #Define fetch function so it can be re-used if issue and need to re-run it
   def fetch_user():
-    results = YOUTUBE.channels().list(
+    results = YOUTUBE[0].channels().list(
       part="snippet", #Can also add "contentDetails" or "statistics"
       mine=True,
       fields="items/id,items/snippet/title"

From e3189a626b2a3a3d740a56e55ffb62230cb6bb72 Mon Sep 17 00:00:00 2001
From: virophagesp <96668463+virophagesp@users.noreply.github.com>
Date: Tue, 14 May 2024 13:30:39 +1200
Subject: [PATCH 02/10] implement multiprocessing in logging.py

replaced instances of auth.YOUTUBE with auth.YOUTUBE[0]
---
 Scripts/logging.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Scripts/logging.py b/Scripts/logging.py
index 40f53c00..5323ddbf 100644
--- a/Scripts/logging.py
+++ b/Scripts/logging.py
@@ -656,7 +656,7 @@ def get_extra_json_data(channelIDs, jsonSettingsDict):
 
   def fetch_data(channelIdGroup):
     try:
-      response = auth.YOUTUBE.channels().list(part="snippet,statistics", id=channelIdGroup, fields=fieldsToFetch).execute()
+      response = auth.YOUTUBE[0].channels().list(part="snippet,statistics", id=channelIdGroup, fields=fieldsToFetch).execute()
       if response['items']:
         for infoDict in response['items']:
           tempDict = {}
@@ -687,7 +687,7 @@ def fetch_data(channelIdGroup):
       pass
 
   # Get info about uploader
-  response = auth.YOUTUBE.channels().list(part="snippet,statistics", id=channelOwnerID, fields=fieldsToFetch).execute()
+  response = auth.YOUTUBE[0].channels().list(part="snippet,statistics", id=channelOwnerID, fields=fieldsToFetch).execute()
   if response['items']:
     tempDict = {}
     tempDict['PublishedAt'] = response['items'][0]['snippet']['publishedAt']

From 2bf28fd222287602ee384ae9989bc1ee54d79170 Mon Sep 17 00:00:00 2001
From: virophagesp <96668463+virophagesp@users.noreply.github.com>
Date: Tue, 14 May 2024 14:04:12 +1200
Subject: [PATCH 03/10] implement multiprocessing in operations.py

multithreading has been added in the get_comments function
---
 Scripts/operations.py | 193 +++++++++++++++++++++---------------------
 1 file changed, 96 insertions(+), 97 deletions(-)

diff --git a/Scripts/operations.py b/Scripts/operations.py
index 5e9d8c2f..203fb223 100644
--- a/Scripts/operations.py
+++ b/Scripts/operations.py
@@ -14,6 +14,7 @@ from datetime import datetime
 from rapidfuzz import fuzz
 from googleapiclient.errors import HttpError
+import multiprocessing
 
 ##########################################################################################
 ############################## GET COMMENT THREADS #######################################
 ##########################################################################################
@@ -31,7 +32,7 @@ def get_comments(current, filtersDict, miscData, config, allVideoCommentsDict, s
   try:
     # Gets all comment threads for a specific video
     if scanVideoID is not None:
-      results = auth.YOUTUBE.commentThreads().list(
+      results = auth.YOUTUBE[0].commentThreads().list(
        part="snippet, replies",
        videoId=scanVideoID,
        maxResults=100,
@@ -42,7 +43,7 @@
 
    # Get all comment threads across the whole channel
    elif scanVideoID is None:
-      results = auth.YOUTUBE.commentThreads().list(
+      results = auth.YOUTUBE[0].commentThreads().list(
        part="snippet, replies",
        allThreadsRelatedToChannelId=auth.CURRENTUSER.id,
        maxResults=100,
@@ -66,99 +67,96 @@
 
    # After getting all comments threads for page, extracts data for each and stores matches in current.matchedCommentsDict
    # Also goes through each thread and executes get_replies() to get reply content and matches
-    for item in results["items"]:
-      comment = item["snippet"]["topLevelComment"]
-      videoID = comment["snippet"]["videoId"]
-      parent_id = item["snippet"]["topLevelComment"]["id"]
-      numReplies = item["snippet"]["totalReplyCount"]
-      timestamp = item["snippet"]["topLevelComment"]["snippet"]["publishedAt"]
-
-      # In case there are no replies
-      if 'replies' in item and 'comments' in item["replies"]:
-        limitedRepliesList = item["replies"]["comments"] # API will return a limited number of replies (~5), but to get all, need to make separate call
-      else:
-        limitedRepliesList = []
+    def getAllCommentsThreadsForPage(current, filtersDict, miscData, config, allVideoCommentsDict, items, creditialFile=0, PrintIt=True):
+      for item in items:
+        comment = item["snippet"]["topLevelComment"]
+        videoID = comment["snippet"]["videoId"]
+        parent_id = item["snippet"]["topLevelComment"]["id"]
+        numReplies = item["snippet"]["totalReplyCount"]
+        timestamp = item["snippet"]["topLevelComment"]["snippet"]["publishedAt"]
+
+        # In case there are no replies
+        if 'replies' in item and 'comments' in item["replies"]:
+          limitedRepliesList = item["replies"]["comments"] # API will return a limited number of replies (~5), but to get all, need to make separate call
+        else:
+          limitedRepliesList = []
 
-      # On rare occasions a comment will be there but the channel name will be empty, so this allows placeholders
-      try:
-        parentAuthorChannelID = comment["snippet"]["authorChannelId"]["value"]
-      except KeyError:
-        parentAuthorChannelID = "[Deleted Channel]"
+        # On rare occasions a comment will be there but the channel name will be empty, so this allows placeholders
+        try:
+          parentAuthorChannelID = comment["snippet"]["authorChannelId"]["value"]
+        except KeyError:
+          parentAuthorChannelID = "[Deleted Channel]"
 
-      # Need to be able to catch exceptions because sometimes the API will return a comment from non-existent / deleted channel
-      try:
-        authorChannelName = comment["snippet"]["authorDisplayName"]
-      except KeyError:
-        authorChannelName = "[Deleted Channel]"
+        # Need to be able to catch exceptions because sometimes the API will return a comment from non-existent / deleted channel
+        try:
+          authorChannelName = comment["snippet"]["authorDisplayName"]
+        except KeyError:
+          authorChannelName = "[Deleted Channel]"
"[Deleted Channel]" - try: - commentText = comment["snippet"]["textDisplay"] # Remove Return carriages - except KeyError: - commentText = "[Deleted/Missing Comment]" + try: + commentText = comment["snippet"]["textDisplay"] # Remove Return carriages + except KeyError: + commentText = "[Deleted/Missing Comment]" + + # Runs check against comment info for whichever filter data is relevant + currentCommentDict = { + 'authorChannelID':parentAuthorChannelID, + 'parentAuthorChannelID':None, + 'authorChannelName':authorChannelName, + 'commentText':commentText, + 'commentID':parent_id, + 'videoID': videoID, + 'timestamp':timestamp, + 'originalCommentID': None + } + if config['json_log_all_comments'] == True: + currentCommentDict['uploaderChannelID'] = miscData.channelOwnerID + currentCommentDict['uploaderChannelName'] = miscData.channelOwnerName + currentCommentDict['textUnsanitized'] = str(commentText) + currentCommentDict['videoTitle'] = utils.get_video_title(current, videoID) + currentCommentDict['matchReason'] = None + currentCommentDict['isSpam'] = 'False' + + + check_against_filter(current, filtersDict, miscData, config, currentCommentDict, videoID) + current.scannedCommentsCount += 1 + + #Log All Comments + try: + if parentAuthorChannelID in allVideoCommentsDict: + allVideoCommentsDict[parentAuthorChannelID].append(currentCommentDict) + else: + allVideoCommentsDict[parentAuthorChannelID] = [currentCommentDict] + except TypeError: # This might not be necessary, might remove later if not + pass + + if numReplies > 0 and (filtersDict['filterMode'] == "AutoSmart" or filtersDict['filterMode'] == "SensitiveSmart") and config['detect_spam_threads'] == True: + parentCommentDict = currentCommentDict + else: + parentCommentDict = None - # Runs check against comment info for whichever filter data is relevant - currentCommentDict = { - 'authorChannelID':parentAuthorChannelID, - 'parentAuthorChannelID':None, - 'authorChannelName':authorChannelName, - 'commentText':commentText, - 'commentID':parent_id, - 'videoID': videoID, - 'timestamp':timestamp, - 'originalCommentID': None - } - if config['json_log_all_comments'] == True: - currentCommentDict['uploaderChannelID'] = miscData.channelOwnerID - currentCommentDict['uploaderChannelName'] = miscData.channelOwnerName - currentCommentDict['textUnsanitized'] = str(commentText) - currentCommentDict['videoTitle'] = utils.get_video_title(current, videoID) - currentCommentDict['matchReason'] = None - currentCommentDict['isSpam'] = 'False' + # If there are more replies than in the limited list + if numReplies > 0 and len(limitedRepliesList) < numReplies: + allVideoCommentsDict = get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, parentCommentDict=parentCommentDict, creditialFile=creditialFile, PrintIt=PrintIt) + if allVideoCommentsDict == "Error": + return "Error", None + # If all the replies are in the limited list + elif numReplies > 0 and len(limitedRepliesList) == numReplies: # limitedRepliesList can never be more than numReplies + allVideoCommentsDict = get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, repliesList=limitedRepliesList, parentCommentDict=parentCommentDict, creditialFile=creditialFile, PrintIt=PrintIt) + if allVideoCommentsDict == "Error": + return "Error", None + elif PrintIt: + print_count_stats(current, miscData, videosToScan, final=False) # Updates displayed stats if no replies - 
-      check_against_filter(current, filtersDict, miscData, config, currentCommentDict, videoID)
-      current.scannedCommentsCount += 1
+    theRange = int(len(results["items"])/2)
 
-      #Log All Comments
-      try:
-        if parentAuthorChannelID in allVideoCommentsDict:
-          allVideoCommentsDict[parentAuthorChannelID].append(currentCommentDict)
-        else:
-          allVideoCommentsDict[parentAuthorChannelID] = [currentCommentDict]
-      except TypeError: # This might not be necessary, might remove later if not
-        pass
-
-      if numReplies > 0 and (filtersDict['filterMode'] == "AutoSmart" or filtersDict['filterMode'] == "SensitiveSmart") and config['detect_spam_threads'] == True:
-        parentCommentDict = currentCommentDict
-      else:
-        parentCommentDict = None
-
-      # If there are more replies than in the limited list
-      if numReplies > 0 and len(limitedRepliesList) < numReplies:
-        allVideoCommentsDict = get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, parentCommentDict=parentCommentDict)
-        if allVideoCommentsDict == "Error":
-          return "Error", None
-
-      # If all the replies are in the limited list
-      elif numReplies > 0 and len(limitedRepliesList) == numReplies: # limitedRepliesList can never be more than numReplies
-        allVideoCommentsDict = get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, repliesList=limitedRepliesList, parentCommentDict=parentCommentDict)
-        if allVideoCommentsDict == "Error":
-          return "Error", None
-      else:
-        print_count_stats(current, miscData, videosToScan, final=False) # Updates displayed stats if no replies
-
-    # Runs after all comments scanned
-    if RetrievedNextPageToken == "End" and allVideoCommentsDict and scanVideoID is not None:
-      dupeCheckModes = utils.string_to_list(config['duplicate_check_modes'])
-      if filtersDict['filterMode'].lower() in dupeCheckModes:
-        print(" Analyzing For Duplicates ", end="\r")
-        check_duplicates(current, config, miscData, allVideoCommentsDict, scanVideoID)
-        print(" ", end="\r")
-      repostCheckModes = utils.string_to_list(config['stolen_comments_check_modes'])
-      if filtersDict['filterMode'].lower() in repostCheckModes:
-        print(" Analyzing For Reposts ", end="\r")
-        check_reposts(current, config, miscData, allVideoCommentsDict, scanVideoID)
-        print(" ", end="\r")
+    thread1 = multiprocessing.Process(target=getAllCommentsThreadsForPage, args=(current, filtersDict, miscData, config, allVideoCommentsDict, results["items"][:theRange], 1, False))
+    thread1.start()
+
+    getAllCommentsThreadsForPage(current, filtersDict, miscData, config, allVideoCommentsDict, results["items"][theRange:])
+
+    thread1.join()
 
    current.allScannedCommentsDict.update(allVideoCommentsDict)
    return RetrievedNextPageToken, allVideoCommentsDict
 
@@ -169,7 +167,7 @@ def get_comments(current, filtersDict, miscData, config, allVideoCommentsDict, s
 ##########################################################################################
 # Call the API's comments.list method to list the existing comment replies.
-def get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, parentCommentDict=None, repliesList=None):
+def get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, parentCommentDict=None, repliesList=None, creditialFile=0, PrintIt=True):
   # Initialize some variables
   authorChannelName = None
   commentText = None
@@ -182,7 +180,7 @@ def get_replies(current, filtersDict, miscData, config, parent_id, videoID, pare
 
   while replyPageToken != "End":
    try:
-      results = auth.YOUTUBE.comments().list(
+      results = auth.YOUTUBE[creditialFile].comments().list(
        part="snippet",
        parentId=parent_id,
        pageToken=replyPageToken,
@@ -275,7 +273,8 @@ def get_replies(current, filtersDict, miscData, config, parent_id, videoID, pare
 
    # Update latest stats
    current.scannedRepliesCount += 1
-    print_count_stats(current, miscData, videosToScan, final=False)
+    if PrintIt:
+      print_count_stats(current, miscData, videosToScan, final=False)
 
    # This won't exist if spam thread detection isn't enabled, because of check in get_comments function
    if parentCommentDict:
@@ -1087,13 +1086,13 @@ def delete_found_comments(commentsList, banChoice, deletionMode, recoveryMode=Fa
  # Local Functions
  def setStatus(commentIDs, failedComments): #Does the actual deletion
    if deletionMode == "reportSpam":
-      result = auth.YOUTUBE.comments().markAsSpam(id=commentIDs).execute()
+      result = auth.YOUTUBE[0].comments().markAsSpam(id=commentIDs).execute()
      if len(result) > 0:
        print("\nSomething may have gone wrong when reporting the comments.")
        failedComments += commentIDs
    elif deletionMode == "heldForReview" or deletionMode == "rejected" or deletionMode == "published":
      try:
-        response = auth.YOUTUBE.comments().setModerationStatus(id=commentIDs, moderationStatus=deletionMode, banAuthor=banChoice).execute()
+        response = auth.YOUTUBE[0].comments().setModerationStatus(id=commentIDs, moderationStatus=deletionMode, banAuthor=banChoice).execute()
        if len(response) > 0:
          failedComments += commentIDs
      except HttpError:
@@ -1168,7 +1167,7 @@ def check_deleted_comments(commentInput):
  print(" (Note: You can disable deletion success checking in the config file to save time and API quota)\n")
  for commentID in commentList:
    try:
-      results = auth.YOUTUBE.comments().list(
+      results = auth.YOUTUBE[0].comments().list(
        part="snippet",
        id=commentID,
        #maxResults=1, #Cannot be used with 'id' parameter
@@ -1230,7 +1229,7 @@ def check_recovered_comments(commentsList):
 
  for comment in commentsList:
    try:
-      results = auth.YOUTUBE.comments().list(
+      results = auth.YOUTUBE[0].comments().list(
        part="snippet",
        id=comment,
        #maxResults=1, # Cannot be used with 'id' parameter
@@ -1403,7 +1402,7 @@ def exclude_authors(current, config, miscData, excludedCommentsDict, authorsToEx
 def get_recent_videos(current, channel_id, numVideosTotal):
  def get_block_of_videos(nextPageToken, j, k, numVideosBlock = 50):
    #fetch the channel resource
-    channel = auth.YOUTUBE.channels().list(
+    channel = auth.YOUTUBE[0].channels().list(
      part="contentDetails",
      id=channel_id).execute()
 
@@ -1411,7 +1410,7 @@ def get_block_of_videos(nextPageToken, j, k, numVideosBlock = 50):
    uploadplaylistId = channel['items'][0]['contentDetails']['relatedPlaylists']['uploads']
 
    #fetch videos in the playlist
-    result = auth.YOUTUBE.playlistItems().list(
+    result = auth.YOUTUBE[0].playlistItems().list(
      part="snippet",
      playlistId=uploadplaylistId,
      pageToken=nextPageToken,
From b11a847741cfec5bd983029e23ca3a3843520082 Mon Sep 17 00:00:00 2001
From: virophagesp <96668463+virophagesp@users.noreply.github.com>
Date: Tue, 14 May 2024 14:05:29 +1200
Subject: [PATCH 04/10] implement multiprocessing in utils.py

changed an instance of auth.YOUTUBE to auth.YOUTUBE[0]
---
 Scripts/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Scripts/utils.py b/Scripts/utils.py
index baff4b77..a8c91cdb 100644
--- a/Scripts/utils.py
+++ b/Scripts/utils.py
@@ -17,7 +17,7 @@ def get_video_title(current, video_id):
    title = current.vidTitleDict[video_id]
  elif current.errorOccurred == False:
    try:
-      results = auth.YOUTUBE.videos().list(
+      results = auth.YOUTUBE[0].videos().list(
        part="snippet",
        id=video_id,
        fields="items/snippet/title",

From 68fd3d4d2f57cfefae5db1f252c689be490b05e6 Mon Sep 17 00:00:00 2001
From: virophagesp <96668463+virophagesp@users.noreply.github.com>
Date: Tue, 14 May 2024 14:07:57 +1200
Subject: [PATCH 05/10] implement multiprocessing in validation.py

replaced instances of auth.YOUTUBE with auth.YOUTUBE[0]
---
 Scripts/validation.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Scripts/validation.py b/Scripts/validation.py
index 9cbc5ef0..e880364d 100644
--- a/Scripts/validation.py
+++ b/Scripts/validation.py
@@ -33,7 +33,7 @@ def validate_video_id(video_url_or_id, silent=False, pass_exception=False, basic
  else:
    try:
      possibleVideoID = match.group('video_id')
-      result = auth.YOUTUBE.videos().list(
+      result = auth.YOUTUBE[0].videos().list(
        part="snippet,id,statistics",
        id=possibleVideoID,
        fields='items/id,items/snippet/channelId,items/snippet/channelTitle,items/statistics/commentCount,items/snippet/title',
@@ -148,7 +148,7 @@ def validate_channel_id(inputted_channel):
    if startIndex < endIndex and endIndex <= len(inputted_channel):
      customURL = inputted_channel[startIndex:endIndex]
 
-      response = auth.YOUTUBE.search().list(part="snippet",q=customURL, maxResults=1, type="channel").execute()
+      response = auth.YOUTUBE[0].search().list(part="snippet",q=customURL, maxResults=1, type="channel").execute()
      if response.get("items"):
        isolatedChannelID = response.get("items")[0]["snippet"]["channelId"] # Get channel ID from custom channel URL username
      else:
@@ -167,7 +167,7 @@ def validate_channel_id(inputted_channel):
      print(f"{F.LIGHTRED_EX}Invalid Channel ID / Link!{S.R} Did you enter a video ID / link by mistake?")
      return False, None, None
 
-    response = auth.YOUTUBE.search().list(part="snippet",q=customURL, maxResults=1, type="channel").execute()
+    response = auth.YOUTUBE[0].search().list(part="snippet",q=customURL, maxResults=1, type="channel").execute()
    if response.get("items"):
      isolatedChannelID = response.get("items")[0]["snippet"]["channelId"] # Get channel ID from custom channel URL username
    else:
@@ -179,7 +179,7 @@ def validate_channel_id(inputted_channel):
    # Check for handle validity: Only letters and numbers, periods, underscores, and hyphens, and between 3 and 30 characters
    if re.match(r'^[a-zA-Z0-9._-]{3,30}$', inputted_channel[1:]):
      # Does a search for the handle and gets the channel ID from first response
-      response = auth.YOUTUBE.search().list(part="snippet",q=inputted_channel, maxResults=1, type="channel").execute()
+      response = auth.YOUTUBE[0].search().list(part="snippet",q=inputted_channel, maxResults=1, type="channel").execute()
      if response.get("items"):
        isolatedChannelID = response.get("items")[0]["snippet"]["channelId"]
      else:
@@ -199,7 +199,7 @@ def validate_channel_id(inputted_channel):
      return False, None, None
  if len(isolatedChannelID) == 24 and isolatedChannelID[0:2] == "UC":
-    response = auth.YOUTUBE.channels().list(part="snippet", id=isolatedChannelID).execute()
+    response = auth.YOUTUBE[0].channels().list(part="snippet", id=isolatedChannelID).execute()
    if response.get('items'):
      channelTitle = response['items'][0]['snippet']['title']
      return True, isolatedChannelID, channelTitle

From 6b01318608a8260512106d8be31b988e2eac52d9 Mon Sep 17 00:00:00 2001
From: virophagesp <96668463+virophagesp@users.noreply.github.com>
Date: Wed, 15 May 2024 11:21:46 +1200
Subject: [PATCH 06/10] add comment in auth.py regarding multithreading

it explains what the second element is for
---
 Scripts/auth.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Scripts/auth.py b/Scripts/auth.py
index ce89e90f..254cd33d 100644
--- a/Scripts/auth.py
+++ b/Scripts/auth.py
@@ -30,6 +30,7 @@ SALT_BYTES = 64
 SCRYPT_N = 2**18
 
+# the second element will be used for multithreading only
 YOUTUBE = [None, None]
 
 CURRENTUSER = None

From 29aa5efb1a53c6cff56bc8c0497dd3c649be99eb Mon Sep 17 00:00:00 2001
From: virophagesp <96668463+virophagesp@users.noreply.github.com>
Date: Wed, 15 May 2024 11:33:13 +1200
Subject: [PATCH 07/10] add comment in operations.py regarding multithreading

---
 Scripts/operations.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Scripts/operations.py b/Scripts/operations.py
index 203fb223..671a528c 100644
--- a/Scripts/operations.py
+++ b/Scripts/operations.py
@@ -151,6 +151,7 @@ def getAllCommentsThreadsForPage(current, filtersDict, miscData, config, allVide
    theRange = int(len(results["items"])/2)
 
+    # the reason that the second element of YOUTUBE is only used here is for readability and the program crashes if used outside of multithreading
    thread1 = multiprocessing.Process(target=getAllCommentsThreadsForPage, args=(current, filtersDict, miscData, config, allVideoCommentsDict, results["items"][:theRange], 1, False))
    thread1.start()

From 130582557d9858f8f973c4f8fdbc5e4dc26c3a10 Mon Sep 17 00:00:00 2001
From: virophagesp <96668463+virophagesp@users.noreply.github.com>
Date: Wed, 15 May 2024 11:35:23 +1200
Subject: [PATCH 08/10] added more multithreading documentation in auth.py

I forgot to add this before
---
 Scripts/auth.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Scripts/auth.py b/Scripts/auth.py
index 254cd33d..e2343682 100644
--- a/Scripts/auth.py
+++ b/Scripts/auth.py
@@ -101,6 +101,8 @@ def get_authenticated_service():
  else:
    with open(TOKEN_FILE_NAME, 'w') as token:
      token.write(creds.to_json())
+
+  # the program crashes if I use the same build for multithreading, so I made a list of 2
  YOUTUBE[0] = build(API_SERVICE_NAME, API_VERSION, credentials=creds, discoveryServiceUrl=DISCOVERY_SERVICE_URL, cache_discovery=False, cache=None)
  YOUTUBE[1] = build(API_SERVICE_NAME, API_VERSION, credentials=creds, discoveryServiceUrl=DISCOVERY_SERVICE_URL, cache_discovery=False, cache=None)
  return YOUTUBE

From 010a6e7c5e8097f7c95b42eaff270db41dfa1aa6 Mon Sep 17 00:00:00 2001
From: virophage
Date: Sat, 25 May 2024 09:09:40 +1200
Subject: [PATCH 09/10] added shared dictionary, but now the RAM usage is extreme

---
 Scripts/operations.py | 124 +++++++++++++++++++++++++++++++-----------
 Scripts/utils.py      |  15 +++--
 2 files changed, 103 insertions(+), 36 deletions(-)

diff --git a/Scripts/operations.py b/Scripts/operations.py
index 671a528c..1d387765 100644
--- a/Scripts/operations.py
+++ b/Scripts/operations.py
@@ -67,7 +67,10 @@ def get_comments(current, filtersDict, miscData, config, allVideoCommentsDict, s
    # After getting all comments threads for page, extracts data for each and stores matches in current.matchedCommentsDict
    # Also goes through each thread and executes get_replies() to get reply content and matches
-    def getAllCommentsThreadsForPage(current, filtersDict, miscData, config, allVideoCommentsDict, items, creditialFile=0, PrintIt=True):
+    def getAllCommentsThreadsForPage(filtersDict, miscData, config, items, shared_Dict=None, thread=0):
+      current = shared_Dict["current"]
+      allVideoCommentsDict = shared_Dict["allVideoCommentsDict"]
+
      for item in items:
        comment = item["snippet"]["topLevelComment"]
        videoID = comment["snippet"]["videoId"]
        parent_id = item["snippet"]["topLevelComment"]["id"]
        numReplies = item["snippet"]["totalReplyCount"]
        timestamp = item["snippet"]["topLevelComment"]["snippet"]["publishedAt"]
@@ -119,7 +122,7 @@ def getAllCommentsThreadsForPage(current, filtersDict, miscData, config, allVide
 
        check_against_filter(current, filtersDict, miscData, config, currentCommentDict, videoID)
-        current.scannedCommentsCount += 1
+        current["scannedCommentsCount"] += 1
 
        #Log All Comments
        try:
@@ -137,28 +140,79 @@ def getAllCommentsThreadsForPage(current, filtersDict, miscData, config, allVide
 
        # If there are more replies than in the limited list
        if numReplies > 0 and len(limitedRepliesList) < numReplies:
-          allVideoCommentsDict = get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, parentCommentDict=parentCommentDict, creditialFile=creditialFile, PrintIt=PrintIt)
+          allVideoCommentsDict = get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, parentCommentDict=parentCommentDict, thread=thread)
          if allVideoCommentsDict == "Error":
            return "Error", None
        # If all the replies are in the limited list
        elif numReplies > 0 and len(limitedRepliesList) == numReplies: # limitedRepliesList can never be more than numReplies
-          allVideoCommentsDict = get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, repliesList=limitedRepliesList, parentCommentDict=parentCommentDict, creditialFile=creditialFile, PrintIt=PrintIt)
+          allVideoCommentsDict = get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, repliesList=limitedRepliesList, parentCommentDict=parentCommentDict, thread=thread)
          if allVideoCommentsDict == "Error":
            return "Error", None
-        elif PrintIt:
+        elif thread == 0:
          print_count_stats(current, miscData, videosToScan, final=False) # Updates displayed stats if no replies
 
-    theRange = int(len(results["items"])/2)
+      # updates shared_Dict
+      placeholderDict = {"current": shared_Dict["current"], "allVideoCommentsDict": shared_Dict["allVideoCommentsDict"]}
+      placeholderDict["allVideoCommentsDict"].update(allVideoCommentsDict)
+      placeholderDict["current"].update(current)
+      shared_Dict["allVideoCommentsDict"] = placeholderDict["allVideoCommentsDict"]
+      shared_Dict["current"] = placeholderDict["current"]
+      otherPlaceholderDict = shared_Dict
 
    # the reason that the second element of YOUTUBE is only used here is for readability and the program crashes if used outside of multithreading
-    thread1 = multiprocessing.Process(target=getAllCommentsThreadsForPage, args=(current, filtersDict, miscData, config, allVideoCommentsDict, results["items"][:theRange], 1, False))
+    shared_memory_manager = multiprocessing.Manager()
+    shared_Dict = shared_memory_manager.dict(
+      {
+        "current": {
+          "matchedCommentsDict": current.matchedCommentsDict,
+          "duplicateCommentsDict": current.duplicateCommentsDict,
+          "repostedCommentsDict": current.repostedCommentsDict,
+          "otherCommentsByMatchedAuthorsDict": current.otherCommentsByMatchedAuthorsDict,
+          "scannedThingsList": current.scannedThingsList,
+          "spamThreadsDict": current.spamThreadsDict,
+          "allScannedCommentsDict": current.allScannedCommentsDict,
+          "vidIdDict": current.vidIdDict,
+          "vidTitleDict": current.vidTitleDict,
+          "matchSamplesDict": current.matchSamplesDict,
+          "authorMatchCountDict": current.authorMatchCountDict,
+          "scannedRepliesCount": current.scannedRepliesCount,
+          "scannedCommentsCount": current.scannedCommentsCount,
+          "logTime": current.logTime,
+          "logFileName": current.logFileName,
+          "errorOccurred": current.errorOccurred
+        },
+        "allVideoCommentsDict": {}
+      }
+    )
+    theRange = int(len(results["items"])/2)
 
+    thread1 = multiprocessing.Process(target=getAllCommentsThreadsForPage, args=(filtersDict, miscData, config, results["items"][:theRange], shared_Dict, 1))
    thread1.start()
 
-    getAllCommentsThreadsForPage(current, filtersDict, miscData, config, allVideoCommentsDict, results["items"][theRange:])
+    getAllCommentsThreadsForPage(filtersDict, miscData, config, results["items"][theRange:], shared_Dict)
 
    thread1.join()
+    current.matchedCommentsDict.update(shared_Dict["current"]["matchedCommentsDict"])
+    current.duplicateCommentsDict.update(shared_Dict["current"]["duplicateCommentsDict"])
+    current.repostedCommentsDict.update(shared_Dict["current"]["repostedCommentsDict"])
+    current.otherCommentsByMatchedAuthorsDict.update(shared_Dict["current"]["otherCommentsByMatchedAuthorsDict"])
+    current.scannedThingsList.extend(shared_Dict["current"]["scannedThingsList"])
+    current.spamThreadsDict.update(shared_Dict["current"]["spamThreadsDict"])
+    current.allScannedCommentsDict.update(shared_Dict["current"]["allScannedCommentsDict"])
+    current.vidIdDict.update(shared_Dict["current"]["vidIdDict"])
+    current.vidTitleDict.update(shared_Dict["current"]["vidTitleDict"])
+    current.matchSamplesDict.update(shared_Dict["current"]["matchSamplesDict"])
+    current.authorMatchCountDict.update(shared_Dict["current"]["authorMatchCountDict"])
+    current.scannedRepliesCount = shared_Dict["current"]["scannedRepliesCount"]
+    current.scannedCommentsCount = shared_Dict["current"]["scannedCommentsCount"]
+    current.logTime = shared_Dict["current"]["logTime"]
+    current.logFileName = shared_Dict["current"]["logFileName"]
+    current.errorOccurred = shared_Dict["current"]["errorOccurred"]
+
+    allVideoCommentsDict = shared_Dict["allVideoCommentsDict"]
 
    current.allScannedCommentsDict.update(allVideoCommentsDict)
    return RetrievedNextPageToken, allVideoCommentsDict
 
@@ -168,7 +222,7 @@ def getAllCommentsThreadsForPage(filtersDict, miscData, config, items, shared_Di
 ##########################################################################################
 # Call the API's comments.list method to list the existing comment replies.
-def get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, parentCommentDict=None, repliesList=None, creditialFile=0, PrintIt=True):
+def get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, parentCommentDict=None, repliesList=None, thread=0):
   # Initialize some variables
   authorChannelName = None
   commentText = None
@@ -181,7 +235,7 @@ def get_replies(current, filtersDict, miscData, config, parent_id, videoID, pare
 
   while replyPageToken != "End":
    try:
-      results = auth.YOUTUBE[creditialFile].comments().list(
+      results = auth.YOUTUBE[thread].comments().list(
        part="snippet",
        parentId=parent_id,
        pageToken=replyPageToken,
@@ -192,12 +246,12 @@ def get_replies(current, filtersDict, miscData, config, parent_id, videoID, pare
    except HttpError as hx:
      traceback.print_exc()
      utils.print_http_error_during_scan(hx)
-      current.errorOccurred = True
+      current["errorOccurred"] = True
      return "Error"
    except Exception as ex:
      traceback.print_exc()
      utils.print_exception_during_scan(ex)
-      current.errorOccurred = True
+      current["errorOccurred"] = True
      return "Error"
    replies.extend(results["items"])
@@ -273,8 +327,8 @@ def get_replies(current, filtersDict, miscData, config, parent_id, videoID, pare
      pass
 
    # Update latest stats
-    current.scannedRepliesCount += 1
-    if PrintIt:
+    current["scannedRepliesCount"] += 1
+    if thread == 0:
      print_count_stats(current, miscData, videosToScan, final=False)
 
    # This won't exist if spam thread detection isn't enabled, because of check in get_comments function
    if parentCommentDict:
@@ -515,15 +569,15 @@ def make_community_thread_dict(commentID, allCommunityCommentsDict):
 # Also count how many spam comments for each author
 def add_spam(current, config, miscData, currentCommentDict, videoID, matchReason="Filter Match", matchedText=None):
  if matchReason == "Filter Match":
-    dictToUse = current.matchedCommentsDict
+    dictToUse = current["matchedCommentsDict"]
  elif matchReason == "Duplicate":
-    dictToUse = current.duplicateCommentsDict
+    dictToUse = current["duplicateCommentsDict"]
  elif matchReason == "Also By Matched Author":
-    dictToUse = current.otherCommentsByMatchedAuthorsDict
+    dictToUse = current["otherCommentsByMatchedAuthorsDict"]
  elif matchReason == "Spam Bot Thread":
-    dictToUse = current.spamThreadsDict
+    dictToUse = current["spamThreadsDict"]
  elif matchReason == "Repost":
-    dictToUse = current.repostedCommentsDict
+    dictToUse = current["repostedCommentsDict"]
 
  commentID = currentCommentDict['commentID']
  authorChannelName = currentCommentDict['authorChannelName']
@@ -542,13 +596,13 @@ def add_spam(current, config, miscData, currentCommentDict, videoID, matchReason
    timestamp = "Unavailable"
 
  dictToUse[commentID] = {'text':commentText, 'textUnsanitized':commentTextRaw, 'authorName':authorChannelName, 'authorID':authorChannelID, 'videoID':videoID, 'matchReason':matchReason, 'originalCommentID':originalCommentID, 'timestamp':timestamp, 'matchedText':matchedText}
-  current.vidIdDict[commentID] = videoID # Probably remove this later, but still being used for now
+  current["vidIdDict"][commentID] = videoID # Probably remove this later, but still being used for now
 
  # Count of comments per author
-  if authorChannelID in current.authorMatchCountDict:
-    current.authorMatchCountDict[authorChannelID] += 1
+  if authorChannelID in current["authorMatchCountDict"]:
+    current["authorMatchCountDict"][authorChannelID] += 1
  else:
-    current.authorMatchCountDict[authorChannelID] = 1
+    current["authorMatchCountDict"][authorChannelID] = 1
 
 
  # If json_log_all_comments is enabled, this is not needed because this info is logged for all comments
@@ -603,7 +657,7 @@ def check_duplicates(current, config, miscData, allVideoCommentsDict, videoID):
  # Run the actual duplicate checking
  for authorID, authorCommentsList in allVideoCommentsDict.items():
    # Don't scan channel owner, current user, or any user in whitelist. Also don't bother if author is already in matchedCommentsDict
-    if auth.CURRENTUSER.id == authorID or miscData.channelOwnerID == authorID or authorID in miscData.resources['Whitelist']['WhitelistContents'] or any(authorID == value['authorID'] for key,value in current.matchedCommentsDict.items()):
+    if auth.CURRENTUSER.id == authorID or miscData.channelOwnerID == authorID or authorID in miscData.resources['Whitelist']['WhitelistContents'] or any(authorID == value['authorID'] for key,value in current["matchedCommentsDict"].items()):
      scannedCount +=1
      print(f"  Analyzing For Duplicates: [ {scannedCount/authorCount*100:.2f}% ] (Can be Disabled & Customized With Config File)".ljust(75, " "), end="\r")
    else:
@@ -750,11 +804,11 @@ def check_against_filter(current, filtersDict, miscData, config, currentCommentD
        if "@"+str(name) in commentText:
          commentText = commentText.replace("@"+str(name), "")
    # Extra logic to detect false positive if spammer's comment already deleted, but someone replied
-    if current.matchedCommentsDict and filtersDict['filterMode'] == "AutoSmart":
-      for key, value in current.matchedCommentsDict.items():
+    if current["matchedCommentsDict"] and filtersDict['filterMode'] == "AutoSmart":
+      for key, value in current["matchedCommentsDict"].items():
        if "@"+str(value['authorName']) in commentText:
          remove = True
-          for key2,value2 in current.matchedCommentsDict.items():
+          for key2,value2 in current["matchedCommentsDict"].items():
            if value2['authorID'] == authorChannelID:
              remove = False
          if remove == True:
@@ -1481,15 +1535,23 @@ def print_count_stats(current, miscData, videosToScan, final):
  # Use videosToScan (list of dictionaries) to retrieve total number of comments
  if videosToScan and miscData.totalCommentCount > 0:
    totalComments = miscData.totalCommentCount
-    totalScanned = current.scannedRepliesCount + current.scannedCommentsCount
+    if type(current) == dict:
+      totalScanned = current["scannedRepliesCount"] + current["scannedCommentsCount"]
+    else:
+      totalScanned = current.scannedRepliesCount + current.scannedCommentsCount
    percent = ((totalScanned / totalComments) * 100)
    progress = f"Total: [{str(totalScanned)}/{str(totalComments)}] ({percent:.0f}%) ".ljust(27, " ") + "|" #Formats percentage to 0 decimal places
  else:
    progress = ""
 
-  comScanned = str(current.scannedCommentsCount)
-  repScanned = str(current.scannedRepliesCount)
-  matchCount = str(len(current.matchedCommentsDict) + len(current.spamThreadsDict))
+  if type(current) == dict:
+    comScanned = str(current["scannedCommentsCount"])
+    repScanned = str(current["scannedRepliesCount"])
+    matchCount = str(len(current["matchedCommentsDict"]) + len(current["spamThreadsDict"]))
+  else:
+    comScanned = str(current.scannedCommentsCount)
+    repScanned = str(current.scannedRepliesCount)
+    matchCount = str(len(current.matchedCommentsDict) + len(current.spamThreadsDict))
 
  if final == True:
    print(f" {progress} Comments Scanned: {F.YELLOW}{comScanned}{S.R} | Replies Scanned: {F.YELLOW}{repScanned}{S.R} | Matches Found So Far: {F.LIGHTRED_EX}{matchCount}{S.R}", end = "\r")
diff --git a/Scripts/utils.py b/Scripts/utils.py
index a8c91cdb..23963724 100644
--- a/Scripts/utils.py
+++ b/Scripts/utils.py
@@ -13,8 +13,13 @@
 ################################### GET VIDEO TITLE ###############################################
 # Check if video title is in dictionary, if not get video title from video ID using YouTube API request, then return title
 def get_video_title(current, video_id):
-  if video_id in current.vidTitleDict.keys():
-    title = current.vidTitleDict[video_id]
+  if type(current) == dict:
+    vidTitleDict = current["vidTitleDict"]
+  else:
+    vidTitleDict = current.vidTitleDict
+
+  if video_id in vidTitleDict.keys():
+    title = vidTitleDict[video_id]
  elif current.errorOccurred == False:
    try:
      results = auth.YOUTUBE[0].videos().list(
@@ -39,13 +44,13 @@ def get_video_title(current, video_id):
 
    if results['items']:
      title = unescape(results["items"][0]["snippet"]["title"])
-      current.vidTitleDict[video_id] = title
+      vidTitleDict[video_id] = title
  elif (len(video_id) == 26 or len(video_id) == 36) and video_id[0:2] == "Ug":
    title = "[Community Post - No Title]"
-    current.vidTitleDict[video_id] = title
+    vidTitleDict[video_id] = title
  else:
    title = "[Title Unavailable]"
-    current.vidTitleDict[video_id] = title
+    vidTitleDict[video_id] = title
  else:
    title = "[Title Unavailable]"

From 8cc8d2a60dca0909617bb584f729ef625b5e18da Mon Sep 17 00:00:00 2001
From: virophage
Date: Sat, 25 May 2024 11:36:51 +1200
Subject: [PATCH 10/10] fixed memory leak

---
 Scripts/operations.py | 35 ++++++++++++++---------------------
 Scripts/utils.py      |  1 +
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/Scripts/operations.py b/Scripts/operations.py
index 1d387765..d74ae97b 100644
--- a/Scripts/operations.py
+++ b/Scripts/operations.py
@@ -65,11 +65,10 @@ def get_comments(current, filtersDict, miscData, config, allVideoCommentsDict, s
    # Get token for next page. If no token, sets to 'End'
    RetrievedNextPageToken = results.get("nextPageToken", "End")
 
-    # After getting all comments threads for page, extracts data for each and stores matches in current.matchedCommentsDict
+    # After getting all comments threads for page, extracts data for each and stores matches in current["matchedCommentsDict"]
    # Also goes through each thread and executes get_replies() to get reply content and matches
    def getAllCommentsThreadsForPage(filtersDict, miscData, config, items, shared_Dict=None, thread=0):
      current = shared_Dict["current"]
-      allVideoCommentsDict = shared_Dict["allVideoCommentsDict"]
 
      for item in items:
        comment = item["snippet"]["topLevelComment"]
        videoID = comment["snippet"]["videoId"]
        parent_id = item["snippet"]["topLevelComment"]["id"]
        numReplies = item["snippet"]["totalReplyCount"]
        timestamp = item["snippet"]["topLevelComment"]["snippet"]["publishedAt"]
@@ -126,10 +125,10 @@ def getAllCommentsThreadsForPage(filtersDict, miscData, config, items, shared_Di
 
        #Log All Comments
        try:
-          if parentAuthorChannelID in allVideoCommentsDict:
-            allVideoCommentsDict[parentAuthorChannelID].append(currentCommentDict)
+          if parentAuthorChannelID in shared_Dict["allVideoCommentsDict"]:
+            shared_Dict["allVideoCommentsDict"][parentAuthorChannelID].append(currentCommentDict)
          else:
-            allVideoCommentsDict[parentAuthorChannelID] = [currentCommentDict]
+            shared_Dict["allVideoCommentsDict"][parentAuthorChannelID] = [currentCommentDict]
        except TypeError: # This might not be necessary, might remove later if not
          pass
@@ -140,25 +139,23 @@ def getAllCommentsThreadsForPage(filtersDict, miscData, config, items, shared_Di
 
        # If there are more replies than in the limited list
        if numReplies > 0 and len(limitedRepliesList) < numReplies:
-          allVideoCommentsDict = get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, parentCommentDict=parentCommentDict, thread=thread)
-          if allVideoCommentsDict == "Error":
+          shared_Dict["allVideoCommentsDict"] = get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, shared_Dict["allVideoCommentsDict"], parentCommentDict=parentCommentDict, thread=thread)
+          if shared_Dict["allVideoCommentsDict"] == "Error":
            return "Error", None
        # If all the replies are in the limited list
        elif numReplies > 0 and len(limitedRepliesList) == numReplies: # limitedRepliesList can never be more than numReplies
-          allVideoCommentsDict = get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, allVideoCommentsDict, repliesList=limitedRepliesList, parentCommentDict=parentCommentDict, thread=thread)
+          shared_Dict["allVideoCommentsDict"] = get_replies(current, filtersDict, miscData, config, parent_id, videoID, parentAuthorChannelID, videosToScan, shared_Dict["allVideoCommentsDict"], repliesList=limitedRepliesList, parentCommentDict=parentCommentDict, thread=thread)
+          if shared_Dict["allVideoCommentsDict"] == "Error":
            return "Error", None
        elif thread == 0:
          print_count_stats(current, miscData, videosToScan, final=False) # Updates displayed stats if no replies
 
-      # updates shared_Dict
-      placeholderDict = {"current": shared_Dict["current"], "allVideoCommentsDict": shared_Dict["allVideoCommentsDict"]}
-      placeholderDict["allVideoCommentsDict"].update(allVideoCommentsDict)
-      placeholderDict["current"].update(current)
-      shared_Dict["allVideoCommentsDict"] = placeholderDict["allVideoCommentsDict"]
-      shared_Dict["current"] = placeholderDict["current"]
-      otherPlaceholderDict = shared_Dict
+      # updates shared_Dict
+      placeholderDict = {"current": shared_Dict["current"]}
placeholderDict["current"].update(current) + shared_Dict["current"] = placeholderDict["current"] + otherplaceholderDict = shared_Dict # the reason that the second element of YOUTUBE is only used here is for readabity and the program crashes if used outside of multithreading shared_memory_manager = multiprocessing.Manager() @@ -169,9 +166,7 @@ def getAllCommentsThreadsForPage(filtersDict, miscData, config, items, shared_Di "duplicateCommentsDict": current.duplicateCommentsDict, "repostedCommentsDict": current.repostedCommentsDict, "otherCommentsByMatchedAuthorsDict": current.otherCommentsByMatchedAuthorsDict, - "scannedThingsList": current.scannedThingsList, "spamThreadsDict": current.spamThreadsDict, - "allScannedCommentsDict": current.allScannedCommentsDict, "vidIdDict": current.vidIdDict, "vidTitleDict": current.vidTitleDict, "matchSamplesDict": current.matchSamplesDict, @@ -198,9 +193,7 @@ def getAllCommentsThreadsForPage(filtersDict, miscData, config, items, shared_Di current.duplicateCommentsDict.update(shared_Dict["current"]["duplicateCommentsDict"]) current.repostedCommentsDict.update(shared_Dict["current"]["repostedCommentsDict"]) current.otherCommentsByMatchedAuthorsDict.update(shared_Dict["current"]["otherCommentsByMatchedAuthorsDict"]) - current.scannedThingsList.extend(shared_Dict["current"]["scannedThingsList"]) current.spamThreadsDict.update(shared_Dict["current"]["spamThreadsDict"]) - current.allScannedCommentsDict.update(shared_Dict["current"]["allScannedCommentsDict"]) current.vidIdDict.update(shared_Dict["current"]["vidIdDict"]) current.vidTitleDict.update(shared_Dict["current"]["vidTitleDict"]) current.matchSamplesDict.update(shared_Dict["current"]["matchSamplesDict"]) @@ -564,7 +557,7 @@ def make_community_thread_dict(commentID, allCommunityCommentsDict): ###################################### ADD SPAM ##################################################### -# If the comment/username matches criteria based on mode, add key/value pair of comment ID and author ID to current.matchedCommentsDict +# If the comment/username matches criteria based on mode, add key/value pair of comment ID and author ID to current["matchedCommentsDict"] # Also add key-value pair of comment ID and video ID to dictionary # Also count how many spam comments for each author def add_spam(current, config, miscData, currentCommentDict, videoID, matchReason="Filter Match", matchedText=None): diff --git a/Scripts/utils.py b/Scripts/utils.py index 23963724..ad74450f 100644 --- a/Scripts/utils.py +++ b/Scripts/utils.py @@ -5,6 +5,7 @@ import Scripts.auth as auth from googleapiclient.errors import HttpError from html import unescape +import shutil ########################################################################################## ############################## UTILITY FUNCTIONS #########################################
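Illustrative sketch (not part of the patch series): the split-the-page pattern that patches 03 and 09 build toward, reduced to a minimal runnable example. The names process_items, worker_id, and results here are hypothetical stand-ins for getAllCommentsThreadsForPage, its thread/creditialFile argument, and the shared dictionary; the real code also gives each worker its own API client (auth.YOUTUBE[0] / auth.YOUTUBE[1]), since a googleapiclient service object is generally not safe to share between threads or processes, which is likely why reusing a single build() crashed.

# sketch.py - minimal sketch of the two-worker split used in these patches (assumed names)
import multiprocessing

def process_items(items, results, worker_id):
    # Stand-in for getAllCommentsThreadsForPage(): store one entry per item in the shared dict
    for item in items:
        results[f"{worker_id}-{item}"] = item.upper()

if __name__ == "__main__":
    items = ["a", "b", "c", "d", "e", "f"]
    half = len(items) // 2                      # same idea as theRange = int(len(results["items"])/2)

    manager = multiprocessing.Manager()
    results = manager.dict()                    # shared between the two processes

    worker = multiprocessing.Process(target=process_items, args=(items[:half], results, 1))
    worker.start()
    process_items(items[half:], results, 0)     # main process handles the other half
    worker.join()

    print(dict(results))                        # entries from both halves are present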
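A second sketch (again illustrative, not from the patches) of the behaviour that motivates the placeholderDict reassignment dance in patches 09 and 10: a plain dict stored inside a multiprocessing.Manager().dict() is returned as a copy when read, so in-place mutations of that nested dict made in a child process are not written back; only reassigning the key on the proxy propagates the change.

# nested_dict_pitfall.py - why shared_Dict["current"] has to be reassigned, not mutated in place
import multiprocessing

def worker(shared):
    shared["current"]["count"] += 1     # lost: mutates a local copy of the nested dict only
    inner = shared["current"]           # read a fresh copy (count is still 0 here)
    inner["count"] += 1
    shared["current"] = inner           # kept: reassigning the key sends the update to the manager

if __name__ == "__main__":
    manager = multiprocessing.Manager()
    shared = manager.dict({"current": {"count": 0}})

    p = multiprocessing.Process(target=worker, args=(shared,))
    p.start()
    p.join()

    print(shared["current"]["count"])   # prints 1, not 2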