Google Cloud Function 构建超时——所有依赖项均已加载

问题描述

我的云函数中有以下代码:

import os
import numpy as np
import requests
 
import torch
from torch import nn
from torch.nn import functional as F
import math
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset
from transformers import AdamW,XLNetTokenizer,XLNetModel,XLNetLMHeadModel,XLNetConfig
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
 
def polarization(request):
    """HTTP entry point: fetch the classifier weights and score one request.

    Args:
        request: The incoming request. Cloud Functions hands the function a
            ``flask.Request``, which is not subscriptable, so its JSON body
            is read through ``get_json``. A plain mapping is accepted as
            well. The payload must contain the key ``"embeddedArticle"``.
            NOTE(review): ``tokenize_inputs`` iterates over this value, so it
            is presumably a list of texts rather than one string - confirm.

    Returns:
        The value returned by ``generate_predictions`` for the payload.
        NOTE(review): an HTTP framework may need this converted to a
        JSON-serialisable value before it can be sent back - confirm.

    Raises:
        requests.HTTPError: If the model download answers with an error status.
        ValueError: If the request carries no JSON body.
        KeyError: If the payload has no ``"embeddedArticle"`` entry.
    """
    MODEL_URL = 'https://polarization.s3-us-west-1.amazonaws.com/classifier_state_dict.pt'
    print(MODEL_URL)
    r = requests.get(MODEL_URL)
    print(r)
    # Fail fast instead of saving an HTTP error page as if it were the model.
    r.raise_for_status()

    # The Cloud Function filesystem is read-only; /tmp is the only writable
    # location. The context manager closes the file even if the write fails.
    with open("/tmp/model.pth", "wb") as file:
        file.write(r.content)

    print("Wrote to the tmp file")
    # A state dict can only be loaded into an already constructed model.
    model = XLNetForpolarizationClassification(num_labels=1)
    # map_location keeps loading working on a CPU-only host even when the
    # checkpoint was saved from a GPU.
    model.load_state_dict(torch.load('/tmp/model.pth', map_location="cpu"))

    # Read the payload: JSON body for a Flask request, the object itself for
    # a plain mapping.
    if hasattr(request, "get_json"):
        payload = request.get_json(silent=True)
    else:
        payload = request
    if payload is None:
        raise ValueError("request body must be JSON containing 'embeddedArticle'")

    # Tokenize the embedded article
    embeddedArticle = payload["embeddedArticle"]
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)
    textIds = tokenize_inputs(embeddedArticle, tokenizer, num_embeddings=250)

    # Generate the attention masks and collect both in one frame
    masks = create_attn_masks(textIds)
    article = pd.DataFrame()
    article["features"] = textIds.tolist()
    article["masks"] = masks

    # Call generate_predictions
    pred = generate_predictions(model, article, 1)

    return pred
  
 
## Extracting parameter and returning prediction
def generate_predictions(model, df, num_labels, device="cpu"):
    """Run the model on the rows of ``df`` and return rounded probabilities.

    Args:
        model: A callable torch module accepting ``input_ids`` and
            ``attention_mask`` keyword arguments and returning logits.
        df: Frame with a ``"features"`` column (token-id lists) and a
            ``"masks"`` column (attention-mask lists), one row per input.
        num_labels: Not used by this function; kept so existing callers
            keep working.
        device: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A NumPy array with the sigmoid of the logits rounded to 0.0 or 1.0.
    """
    model.eval()

    # Read from the ``df`` argument; the previous code referenced an
    # undefined ``df_subset`` name and raised NameError.
    X = torch.tensor(df["features"].values.tolist())
    masks = torch.tensor(df["masks"].values.tolist(), dtype=torch.long)

    with torch.no_grad():
        # Run the model with the input_ids and attention_masks separately
        logits = model(input_ids=X, attention_mask=masks)
        # Turn the logits into per-label probabilities
        probabilities = logits.sigmoid().detach().cpu().numpy()

    # np.round works element-wise on arrays; the built-in round() does not
    # accept a NumPy array.
    return np.round(probabilities)
 
 
class XLNetForpolarizationClassification(torch.nn.Module):
    """XLNet encoder followed by a linear head over the mean hidden state.

    Args:
        num_labels: Number of output logits per input sequence.
    """

    def __init__(self, num_labels=2):
        super().__init__()
        self.num_labels = num_labels
        self.xlnet = XLNetModel.from_pretrained('xlnet-base-cased')
        # The head width follows num_labels so that it agrees with the
        # ``view(-1, self.num_labels)`` reshape in forward(); it used to be
        # hard-coded to a single output regardless of num_labels.
        self.classifier = torch.nn.Linear(768, num_labels)
        torch.nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return logits, or the BCE-with-logits loss when labels are given.

        Args:
            input_ids: Token ids passed to the encoder.
            token_type_ids: Optional segment ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            labels: Optional targets; when present the loss is returned
                instead of the logits.

        Returns:
            The loss tensor when ``labels`` is not None, otherwise the logits.
        """
        encoder_output = self.xlnet(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        mean_last_hidden_state = self.pool_hidden_state(encoder_output)
        logits = self.classifier(mean_last_hidden_state)

        # Without labels there is nothing to compare against: return logits.
        if labels is None:
            return logits

        loss_fct = BCEWithLogitsLoss()
        # BCEWithLogitsLoss needs targets of the logits' floating dtype, so
        # integer labels are cast instead of raising.
        targets = labels.view(-1, self.num_labels).type_as(logits)
        return loss_fct(logits.view(-1, self.num_labels), targets)

    def pool_hidden_state(self, last_hidden_state):
        """Average the first element of the encoder output over dimension 1.

        Args:
            last_hidden_state: Encoder output; its element 0 is the tensor
                that gets averaged.

        Returns:
            The mean of that tensor along dimension 1.
        """
        hidden = last_hidden_state[0]
        return torch.mean(hidden, 1)
 
def create_attn_masks(input_ids):
    """Build one attention mask per sequence of token ids.

    Every id greater than zero maps to 1.0 and every other id maps to 0.0.

    Args:
        input_ids: Iterable of token-id sequences.

    Returns:
        A list of lists of floats, one inner list per input sequence.
    """
    return [[float(token_id > 0) for token_id in row] for row in input_ids]
 
 
def tokenize_inputs(text, tokenizer, num_embeddings=250):
    """Tokenize each text and return a padded array of token ids.

    The tokenizer is now an explicit parameter: the caller in this file
    already passes it as the second positional argument, and the body
    previously read an undefined global ``tokenizer``.

    Args:
        text: Iterable of strings; each element is tokenized on its own.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``
            and ``build_inputs_with_special_tokens``.
        num_embeddings: Length every id sequence is padded or truncated to.

    Returns:
        The result of ``pad_sequences``: one row of ``num_embeddings`` ids
        per input text.
    """
    # Leave room for the 2 special tokens that are added below.
    max_tokens = num_embeddings - 2
    tokenized_texts = [tokenizer.tokenize(t)[:max_tokens] for t in text]

    # convert tokenized text into numeric ids for the appropriate LM
    input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]

    # Let the tokenizer add its special tokens (for XLNet these are <sep>
    # and <cls>, appended at the end of the sequence).
    input_ids = [tokenizer.build_inputs_with_special_tokens(ids) for ids in input_ids]

    # Pad (or cut) at the end so every row has exactly num_embeddings ids.
    input_ids = pad_sequences(
        input_ids,
        maxlen=num_embeddings,
        dtype="long",
        truncating="post",
        padding="post",
    )
    return input_ids

以及 requirements.txt:

certifi==2020.6.20
chardet==3.0.4
click==7.1.2
cycler==0.10.0
filelock==3.0.12
future==0.18.2
h5py==2.10.0
idna==2.10
joblib==0.16.0
Keras==2.4.3
kiwisolver==1.2.0
matplotlib==3.3.0
numpy==1.19.1
packaging==20.4
Pillow==7.2.0
pyparsing==2.4.7
python-dateutil==2.8.1
PyYAML==5.3.1
regex==2020.7.14
requests==2.24.0
sacremoses==0.0.43
scipy==1.5.2
sentencepiece==0.1.91
six==1.15.0
tokenizers==0.8.1rc1
torch==1.6.0
tqdm==4.48.2
transformers==3.0.2
urllib3==1.25.10

但是,当我进行部署时,构建超时了。日志没有显示任何错误,并且显示 requirements.txt 文件中的每个依赖项均已构建完成。第一条 print 语句也没有被记录。我看不出是模型还是依赖项中的哪一部分导致了超时问题。这是日志的屏幕截图——在出现"Context deadline exceeded"(超出上下文截止期限)这条信息之前,所有日志都是正常的 info 级别信息。我认为在无法访问该函数的情况下,我没办法共享实际的日志。我已将超时设置为 9 分钟(540 秒)。[1]

解决方法

这很可能是由 requirements.txt 中的 torch 依赖项引起的:它会让您安装包含 CUDA 的 PyTorch 库,而该版本需要 GPU(Cloud Functions 上不提供 GPU)。

您可以改为在 requirements.txt 中使用指向仅 CPU 版本的直接链接,如下所示:

certifi==2020.6.20
chardet==3.0.4
click==7.1.2
cycler==0.10.0
filelock==3.0.12
future==0.18.2
h5py==2.10.0
idna==2.10
joblib==0.16.0
Keras==2.4.3
kiwisolver==1.2.0
matplotlib==3.3.0
numpy==1.19.1
packaging==20.4
Pillow==7.2.0
pyparsing==2.4.7
python-dateutil==2.8.1
PyYAML==5.3.1
regex==2020.7.14
requests==2.24.0
sacremoses==0.0.43
scipy==1.5.2
sentencepiece==0.1.91
six==1.15.0
tokenizers==0.8.1rc1
https://download.pytorch.org/whl/cpu/torch-1.6.0%2Bcpu-cp37-cp37m-linux_x86_64.whl
tqdm==4.48.2
transformers==3.0.2
urllib3==1.25.10

有关如何选择其他 PyTorch 版本,请参见此回答。