import pandas
data = pandas.read_csv("Source_Sheet.csv")
data1 = pandas.read_csv("Target sheet.csv")
#print(data.dtypes)
data1["permanent address"] = data["Permanent Address"]
data1["delhi address"] = data["Delhi Address"]
name_party_area = data["Name of Member \nParty \nConstituency(State)"].str.split('\n')
name = []
party = []
area = []
state = []
for n in name_party_area:
name.append(n[0])
for p in name_party_area:
party.append(p[1])
for a in name_party_area:
try:
temp = a[2]
k = temp[0:temp.find("(")]
area.append(k)
l = temp[temp.find("(")+1:-1]
state.append(l)
except:
pass
data1["name"] = pandas.Series(name)
data1["organisation"] = pandas.Series(party)
data1["Area"] = pandas.Series(area)
data1["State"] = pandas.Series(state)
email_phone = data["Email Address \nTelephone Nos."].str.split('\n')
#print(email_phone[0])
sansad_email = []
email = []
for item in email_phone:
try:
if "@" in item[0]:
if "@sansad.nic.in" in item[0]:
sansad_email.append(item[0])
if "@" not in item[1]:
email.append("NA")
else:
email.append(item[0])
sansad_email.append("NA")
else:
sansad_email.append("NA")
email.append("NA")
if "@" in item[1]:
email.append(item[1])
except:
pass
number = data["Permanent Telephone No."]
nos = []
sec_nos = []
for num in number:
try:
d = num.rfind("(M)")
g = num[d-11:d]
nos.append(g)
except:
nos.append("NA")
try:
j = num.find("ax")
h = num[j+5:j+18]
sec_nos.append(h)
except:
sec_nos.append("NA")
data1["Mobile"] = pandas.Series(nos)
data1["Secondary phones"] = pandas.Series(sec_nos)
data1["email "] = pandas.Series(email)
data1["sansad email"] = pandas.Series(sansad_email)
data1.to_csv("Target_sheet.csv")
print(data1.shape)这是我的方法,但我没有得到预期的结果。此外,输出只显示9行,而应该返回403行。我有一个问题,提取永久移动号码和次要移动号码。
指向csv文件的链接是- PH8EFkH9ZhejwZc8QY6v。
发布于 2018-08-23 03:18:07
我不确定我是否正确地理解了你的问题,但这是我的建议。
import pandas as pd
import re
data = pd.read_csv(r"../notebooks/Source Sheet.csv")
data.head()

data1 = pd.DataFrame(
index = data.index,
columns = ['name','organisation','Area','State','Mobile','Secondary
phones','email','sansad email','permanent address','delhi address']
)
data1.head()

# ADRESS
data1["permanent address"] = data["Permanent Address"]
data1["delhi address"] = data["Delhi Address"]
# NAME/ORG (AREA temp.)
data1[['name','organisation','Area']] = (data['Name of Member \nParty
\nConstituency(State)'].str.split('\n',1,expand=True).apply(
{0: lambda x: x.str.split(',',1,expand=True),
1: lambda x: x.str.replace('\\n','')}) )
# AREA/STATE
data1[['Area','State']] = (data1['Area'].str.replace('\)','')
.str.split('\(',1,expand=True))
# E-MAILS
data1[['email','sansad email']] = (data['Email Address \nTelephone Nos.']
.str.findall("([a-z0-9]+[\.'\-a-z0-9_]*[a-z0-9]+@[a-z0-9]+[\.'\-a-z0-9_]*[a-z0-9])").to_frame()
.apply(
{'Email Address \nTelephone Nos.':(lambda x: ', '.join(x) if type(x)==list else [])}
).iloc[:,0].str.split(',',1,expand=True)
# PHONE NUMBERS
def mobile(x):
if (type(x)==list):
x = ' '.join(sorted(list(set(x))))
return x if len(x.replace('(','').replace(')','').replace(' ',''))>8 else None
else:
return None
def phone(x):
if type(x)==str:
x = x.replace('(M)','').replace('\n','')
x = re.sub(r'09[0-9]*','',x)
x = x.rstrip(' ').rstrip(',')
return x if not (x.strip(' ') in [',','']) else None
else:
return None
data1['Mobile'] = (data['Permanent Telephone No.'].str.findall(
"\([0-9]{1,4}\)|09[0-9]*").to_frame().applymap(mobile))
data1['Secondary phones'] = (data['Permanent Telephone No.'].map(phone).to_frame())
# FINAL RESULT
print(data1.shape)
data1.head()

编辑对mobile()和phone()函数的解释:
让我们来看看你的原始电话信息:
>>> data['Permanent Telephone No.'].head()
0 Tel : (0194) 2452540, 09018210000 (M)\nFax. (0...
1 Tel : (02133) 235101, 09868180788 (M) \nFax : ...
2 0903799999 (M)\nFax : (033) 24999121
3 Tel : (03220) 255599, 09434005207 (M) \nFax : ...
4 Tel : (03220) 255067, 09434039494 (M)\nFax : (...
5 Tels : (022) 28871042, 28863403 (R), 098681802...
6 Tel : (079) 22504525
7 Telefax : (0121) 2769955, 09412202623 (M)
8 Tel : (07172) 251651\nFax : (07172) 254791
9 09549477777 (M)我没有使用(M)来识别绝对移动号码位置,而是使用正则表达式(regex)搜索模式。为了做到这一点,我假设所有的移动电话号码都是从09开始的(对吗?)我还想保留当地的区号。
"\([0-9]{1,4}\) -->>> Any sequence of up to 4 numbers inside ()
| -->>> `OR`: logical
09[0-9]*" -->>> Numbers starting with 09NNNN...因此,str.findall()查找所有出现的正则表达式,并将它们作为列表返回:
>>> data['Permanent Telephone No.'].str.findall("\([0-9]{1,4}\)|09[0-9]*")
0 [(0194), 09018210000, (0194)]
1 [09868180788]
2 [0903799999, (033)]
3 [09434005207]
4 [09434039494]
5 [(022), 09868180266]
6 [(079)]
7 [(0121), 09412202623]
8 []
9 [09549477777]但是最终的data1['Mobile']必须是一个字符串,而不是一个列表。因此,我定义了mobile()函数。为了避免重复发生,我使用(list(set(x))和sorted()来确保“代码区域”位于单元格号之前:
>>> x = ['(0194)', '09018210000', '(0194)']
>>> x = ' '.join(sorted(list(set(x))))
>>> print(x)
(0194) 09018210000phone()函数更简单。
re.sub(r'09[0-9]*','',x))https://stackoverflow.com/questions/51968465
复制相似问题